handle long literal constants faster

2025-08-03 00:45:53 +08:00 · 2022-04-24 02:50:09 -04:00
parent 464801bcb3
commit 371138cfbc
10 changed files with 2061 additions and 63 deletions
--- a/test/simple_source/expression/05_long_list.py
+++ b/test/simple_source/expression/05_long_list.py
@@ -1,3 +0,0 @@
-# Long lists pose a slowdown in uncompiling.
-x = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-print(x)
--- a/test/simple_source/expression/05_long_literals.py
+++ b/test/simple_source/expression/05_long_literals.py
--- a/uncompyle6/parser.py
+++ b/uncompyle6/parser.py
@@ -56,6 +56,7 @@ class PythonParser(GenericASTBuilder):
            "_come_froms",
            "_stmts",
            "attributes",
+            "add_consts",
            "come_froms",
            "except_stmts",
            "exprlist",
--- a/uncompyle6/parsers/parse37base.py
+++ b/uncompyle6/parsers/parse37base.py
@@ -319,6 +319,22 @@ class Python37BaseParser(PythonParser):
                    """
                self.addRule(rules_str, nop_func)

+            elif opname in ("BUILD_CONST_LIST", "BUILD_CONST_DICT", "BUILD_CONST_SET"):
+                if opname == "BUILD_CONST_DICT":
+                    rule = f"""
+                            add_consts          ::= ADD_VALUE*
+                            const_list          ::= COLLECTION_START add_consts {opname}
+                            dict                ::= const_list
+                            expr                ::= dict
+                        """
+                else:
+                    rule = f"""
+                            add_consts          ::= ADD_VALUE*
+                            const_list          ::= COLLECTION_START add_consts {opname}
+                            expr                ::= const_list
+                        """
+                self.addRule(rule, nop_func)
+
            elif opname_base == "BUILD_CONST_KEY_MAP":
                kvlist_n = "expr " * (token.attr)
                rule = "dict ::= %sLOAD_CONST %s" % (kvlist_n, opname)
--- a/uncompyle6/scanners/scanner37.py
+++ b/uncompyle6/scanners/scanner37.py
@@ -22,6 +22,7 @@ This sets up opcodes Python's 3.7 and calls a generalized
 scanner routine for Python 3.
 """

+from typing import Tuple
 from uncompyle6.scanners.scanner37base import Scanner37Base

 # bytecode verification, verify(), uses JUMP_OPs from here
@@ -30,6 +31,8 @@ from xdis.opcodes import opcode_37 as opc
 # bytecode verification, verify(), uses JUMP_OPS from here
 JUMP_OPs = opc.JUMP_OPS

+CONST_COLLECTIONS = ("CONST_LIST", "CONST_SET", "CONST_DICT")
+

 class Scanner37(Scanner37Base):
    def __init__(self, show_asm=None, is_pypy: bool=False):
@@ -39,9 +42,28 @@ class Scanner37(Scanner37Base):

    pass

-    def ingest(self, co, classname=None, code_objects={}, show_asm=None):
+    def ingest(
+        self, co, classname=None, code_objects={}, show_asm=None
+    ) -> Tuple[list, dict]:
        tokens, customize = Scanner37Base.ingest(self, co, classname, code_objects, show_asm)
-        for t in tokens:
+        new_tokens = []
+        for i, t in enumerate(tokens):
+            # things that smash new_tokens like BUILD_LIST have to come first.
+            if t.op in (
+                self.opc.BUILD_CONST_KEY_MAP,
+                self.opc.BUILD_LIST,
+                self.opc.BUILD_SET,
+            ):
+                collection_type = (
+                    "DICT"
+                    if t.kind.startswith("BUILD_CONST_KEY_MAP")
+                    else t.kind.split("_")[1]
+                )
+                new_tokens = self.bound_collection(
+                    tokens, new_tokens, t, i, f"CONST_{collection_type}"
+                )
+                continue
+
            # The lowest bit of flags indicates whether the
            # var-keyword argument is placed at the top of the stack
            if t.op == self.opc.CALL_FUNCTION_EX and t.attr & 1:
@@ -59,8 +81,9 @@ class Scanner37(Scanner37Base):
                t.kind = "BUILD_MAP_UNPACK_WITH_CALL_%d" % t.attr
            elif not self.is_pypy and t.op == self.opc.BUILD_TUPLE_UNPACK_WITH_CALL:
                t.kind = "BUILD_TUPLE_UNPACK_WITH_CALL_%d" % t.attr
-            pass
-        return tokens, customize
+            new_tokens.append(t)
+
+        return new_tokens, customize

 if __name__ == "__main__":
    from xdis.version_info import PYTHON_VERSION_TRIPLE, version_tuple_to_str
--- a/uncompyle6/scanners/scanner37base.py
+++ b/uncompyle6/scanners/scanner37base.py
@@ -1,4 +1,4 @@
-#  Copyright (c) 2015-2020 by Rocky Bernstein
+#  Copyright (c) 2015-2020, 2022 by Rocky Bernstein
 #  Copyright (c) 2005 by Dan Pascu <dan@windowmaker.org>
 #  Copyright (c) 2000-2002 by hartmut Goebel <h.goebel@crazy-compilers.com>
 #
@@ -29,6 +29,8 @@ For example:
 Finally we save token information.
 """

+from typing import Any, Dict, List, Set
+
 from xdis import iscode, instruction_size, Instruction
 from xdis.bytecode import _get_const_info

@@ -45,6 +47,9 @@ import sys
 globals().update(op3.opmap)


+CONST_COLLECTIONS = ("CONST_LIST", "CONST_SET", "CONST_DICT")
+
+
 class Scanner37Base(Scanner):
    def __init__(self, version, show_asm=None, is_pypy=False):
        super(Scanner37Base, self).__init__(version, show_asm, is_pypy)
@@ -179,6 +184,80 @@ class Scanner37Base(Scanner):
        # self.varargs_ops = frozenset(self.opc.hasvargs)
        return

+    def bound_collection(
+        self, tokens: list, next_tokens: list, t: Token, i: int, collection_type: str
+    ):
+        """
+        Try to collapse the operand loads of collection-building token `t`
+        (found at tokens[i]) into COLLECTION_START, ADD_VALUE ..., and
+        BUILD_<collection_type> tokens.
+
+        `next_tokens` is the output list built so far; it is never
+        modified in place.
+
+        Returns a new token list. When there are fewer than 5 operands, or
+        an operand is not a plain LOAD_CONST, LOAD_FAST, LOAD_GLOBAL or
+        LOAD_NAME, that list is just `next_tokens` with `t` appended.
+        """
+        n_operands = t.attr
+        assert isinstance(n_operands, int)
+        assert n_operands <= i
+
+        if collection_type == "CONST_DICT":
+            # BUILD_CONST_KEY_MAP has one operand more than its count: the
+            # values are followed by a single load of the tuple of keys.
+            n_operands += 1
+
+        first = i - n_operands
+        operand_range = range(first, i)
+        plain_loads = ("LOAD_CONST", "LOAD_FAST", "LOAD_GLOBAL", "LOAD_NAME")
+
+        # Small collections are not worth the bother.
+        if n_operands < 5 or any(
+            tokens[k].kind not in plain_loads for k in operand_range
+        ):
+            return next_tokens + [t]
+
+        type_index = CONST_COLLECTIONS.index(collection_type)
+
+        # Every operand is a plain load: swap the loads already copied into
+        # next_tokens for a boundary marker followed by ADD_VALUE tokens.
+        start_marker = Token(
+            opname="COLLECTION_START",
+            attr=type_index,
+            pattr=collection_type,
+            offset=f"{tokens[first].offset}_0",
+            has_arg=True,
+            opc=self.opc,
+            has_extended_arg=False,
+        )
+
+        values = [
+            Token(
+                opname="ADD_VALUE",
+                attr=tokens[k].attr,
+                pattr=tokens[k].pattr,
+                offset=tokens[k].offset,
+                has_arg=True,
+                linestart=tokens[k].linestart,
+                opc=self.opc,
+                has_extended_arg=False,
+            )
+            for k in operand_range
+        ]
+
+        build = Token(
+            opname=f"BUILD_{collection_type}",
+            attr=t.attr,
+            pattr=t.pattr,
+            offset=t.offset,
+            has_arg=t.has_arg,
+            linestart=t.linestart,
+            opc=t.opc,
+            has_extended_arg=False,
+        )
+        return next_tokens[:-n_operands] + [start_marker] + values + [build]
+
    def ingest(self, co, classname=None, code_objects={}, show_asm=None):
        """
        Pick out tokens from an uncompyle6 code object, and transform them,
@@ -212,7 +291,7 @@ class Scanner37Base(Scanner):
        # show_asm = 'both'
        if show_asm in ("both", "before"):
            for instr in bytecode.get_instructions(co):
-                print(instr.disassemble())
+                print(instr.disassemble(self.opc))

        # "customize" is in the process of going away here
        customize = {}
@@ -316,6 +395,7 @@ class Scanner37Base(Scanner):
                # "loop" tag last so the grammar rule matches that properly.
                for jump_offset in sorted(jump_targets[inst.offset], reverse=True):
                    come_from_name = "COME_FROM"
+
                    opname = self.opname_for_offset(jump_offset)
                    if opname == "EXTENDED_ARG":
                        k = xdis.next_offset(op, self.opc, jump_offset)
@@ -342,22 +422,6 @@ class Scanner37Base(Scanner):
                    jump_idx += 1
                    pass
                pass
-            elif inst.offset in self.else_start:
-                end_offset = self.else_start[inst.offset]
-                j = tokens_append(
-                    j,
-                    Token(
-                        "ELSE",
-                        None,
-                        repr(end_offset),
-                        offset="%s" % (inst.offset),
-                        has_arg=True,
-                        opc=self.opc,
-                        has_extended_arg=inst.has_extended_arg,
-                    ),
-                )
-
-                pass

            pattr = inst.argrepr
            opname = inst.opname
@@ -444,17 +508,24 @@ class Scanner37Base(Scanner):
                opname = "%s_%d+%d" % (opname, before_args, after_args)

            elif op == self.opc.JUMP_ABSOLUTE:
-                # Further classify JUMP_ABSOLUTE into backward jumps
-                # which are used in loops, and "CONTINUE" jumps which
-                # may appear in a "continue" statement.  The loop-type
-                # and continue-type jumps will help us classify loop
-                # boundaries The continue-type jumps help us get
-                # "continue" statements with would otherwise be turned
-                # into a "pass" statement because JUMPs are sometimes
-                # ignored in rules as just boundary overhead. In
-                # comprehensions we might sometimes classify JUMP_BACK
-                # as CONTINUE, but that's okay since we add a grammar
-                # rule for that.
+                # Refine JUMP_ABSOLUTE further into:
+                #
+                # * "JUMP_LOOP"    - which are used in loops. This is sometimes
+                #                    found at the end of a looping construct
+                # * "BREAK_LOOP"   - which are used to break loops.
+                # * "CONTINUE"     - jumps which may appear in a "continue" statement.
+                #                    It is okay to confuse this with JUMP_LOOP. The
+                #                    grammar should tolerate this.
+                # * "JUMP_FORWARD" - forward jumps that are not BREAK_LOOP jumps.
+                #
+                # The loop-type and continue-type jumps will help us
+                # classify loop boundaries. The continue-type jumps
+                # help us get "continue" statements which would
+                # otherwise be turned into a "pass" statement because
+                # JUMPs are sometimes ignored in rules as just
+                # boundary overhead. Again, in comprehensions we might
+                # sometimes classify JUMP_LOOP as CONTINUE, but that's
+                # okay since grammar rules should tolerate that.
                pattr = argval
                target = inst.argval
                if target <= inst.offset:
@@ -523,7 +594,7 @@ class Scanner37Base(Scanner):
            print()
        return tokens, customize

-    def find_jump_targets(self, debug):
+    def find_jump_targets(self, debug: str) -> dict:
        """
        Detect all offsets in a byte code which are jump targets
        where we might insert a COME_FROM instruction.
@@ -538,18 +609,17 @@ class Scanner37Base(Scanner):
        self.structs = [{"type": "root", "start": 0, "end": n - 1}]

        # All loop entry points
-        self.loops = []
+        self.loops: List[int] = []

        # Map fixed jumps to their real destination
-        self.fixed_jumps = {}
+        self.fixed_jumps: Dict[int, int] = {}
        self.except_targets = {}
-        self.ignore_if = set()
+        self.ignore_if: Set[int] = set()
        self.build_statement_indices()
-        self.else_start = {}

        # Containers filled by detect_control_flow()
-        self.not_continue = set()
-        self.return_end_ifs = set()
+        self.not_continue: Set[int] = set()
+        self.return_end_ifs: Set[int] = set()
        self.setup_loop_targets = {}  # target given setup_loop offset
        self.setup_loops = {}  # setup_loop offset given target

@@ -655,9 +725,9 @@ class Scanner37Base(Scanner):
                ):
                    stmts.remove(stmt_offset)
                    continue
-                # Rewing ops till we encounter non-JUMP_ABSOLUTE one
+                # Scan back bytecode ops till we encounter non-JUMP_ABSOLUTE op
                j = self.prev_op[stmt_offset]
-                while code[j] == self.opc.JUMP_ABSOLUTE:
+                while code[j] == self.opc.JUMP_ABSOLUTE and j > 0:
                    j = self.prev_op[j]
                # If we got here, then it's list comprehension which
                # is not a statement too
@@ -687,7 +757,9 @@ class Scanner37Base(Scanner):
        # Finish filling the list for last statement
        slist += [codelen] * (codelen - len(slist))

-    def detect_control_flow(self, offset, targets, inst_index):
+    def detect_control_flow(
+        self, offset: int, targets: Dict[Any, Any], inst_index: int
+    ):
        """
        Detect type of block structures and their boundaries to fix optimized jumps
        in python2.3+
@@ -698,9 +770,9 @@ class Scanner37Base(Scanner):
        op = inst.opcode

        # Detect parent structure
-        parent = self.structs[0]
-        start = parent["start"]
-        end = parent["end"]
+        parent: Dict[str, Any] = self.structs[0]
+        start: int = parent["start"]
+        end: int = parent["end"]

        # Pick inner-most parent for our offset
        for struct in self.structs:
@@ -933,20 +1005,16 @@ class Scanner37Base(Scanner):


 if __name__ == "__main__":
-    from uncompyle6 import PYTHON_VERSION
+    from xdis.version_info import PYTHON_VERSION_TRIPLE, version_tuple_to_str

-    if PYTHON_VERSION >= 3.7:
+    if PYTHON_VERSION_TRIPLE[:2] == (3, 7):
        import inspect

-        co = inspect.currentframe().f_code
-        from uncompyle6 import PYTHON_VERSION
+        co = inspect.currentframe().f_code  # type: ignore

-        tokens, customize = Scanner37Base(PYTHON_VERSION).ingest(co)
+        tokens, customize = Scanner37Base(PYTHON_VERSION_TRIPLE).ingest(co)
        for t in tokens:
            print(t)
    else:
-        print(
-            "Need to be Python 3.7 or greater to demo; I am version {PYTHON_VERSION}."
-            % PYTHON_VERSION
-        )
+        print(f"Need to be Python 3.7 to demo; I am version {version_tuple_to_str()}.")
    pass
--- a/uncompyle6/scanners/scanner38.py
+++ b/uncompyle6/scanners/scanner38.py
@@ -62,6 +62,8 @@ class Scanner38(Scanner37):
            print(jump_back_targets)
        loop_ends = []
        next_end = tokens[len(tokens) - 1].off2int() + 10
+
+        new_tokens = []
        for i, token in enumerate(tokens):
            opname = token.kind
            offset = token.offset
@@ -76,6 +78,8 @@ class Scanner38(Scanner37):
                    else tokens[len(tokens) - 1].off2int() + 10
                )

+            # things that smash new_tokens like BUILD_LIST have to come first.
+
            if offset in jump_back_targets:
                next_end = off2int(jump_back_targets[offset], prefer_last=False)
                if self.debug:
@@ -93,6 +97,7 @@ class Scanner38(Scanner37):
                if opname == "JUMP_ABSOLUTE" and jump_target <= next_end:
                    # Not a forward-enough jump to break out of the next loop, so continue.
                    # FIXME: Do we need "continue" detection?
+                    new_tokens.append(token)
                    continue

                # We also want to avoid confusing BREAK_LOOPS with parts of the
@@ -123,8 +128,8 @@ class Scanner38(Scanner37):
                ):
                    token.kind = "BREAK_LOOP"
                pass
-            pass
-        return tokens, customize
+            new_tokens.append(token)
+        return new_tokens, customize


 if __name__ == "__main__":
--- a/uncompyle6/semantics/consts.py
+++ b/uncompyle6/semantics/consts.py
@@ -282,6 +282,7 @@ TABLE_DIRECT = {
    "comp_if": (" if %c%c", 0, 2),
    "comp_if_not": (" if not %p%c", (0, "expr", PRECEDENCE["unary_not"]), 2),
    "comp_body": ("",),  # ignore when recusing
+
    "set_comp_body": ("%c", 0),
    "gen_comp_body": ("%c", 0),
    "dict_comp_body": ("%c:%c", 1, 0),
--- a/uncompyle6/semantics/make_function36.py
+++ b/uncompyle6/semantics/make_function36.py
@@ -277,8 +277,16 @@ def make_function36(self, node, is_lambda, nested=1, code_node=None):
        # FIXME: handle free_tup, ann_dict, and default_tup
        if kw_dict:
            assert kw_dict == "dict"
+            const_list = kw_dict[0]
+            if kw_dict[0] == "const_list":
+                add_consts = const_list[1]
+                assert add_consts == "add_consts"
+                names = add_consts[-1].attr
+                defaults = [v.pattr for v in add_consts[:-1]]
+            else:
                defaults = [self.traverse(n, indent="") for n in kw_dict[:-2]]
                names = eval(self.traverse(kw_dict[-2]))
+
            assert len(defaults) == len(names)
            # FIXME: possibly handle line breaks
            for i, n in enumerate(names):
--- a/uncompyle6/semantics/n_actions.py
+++ b/uncompyle6/semantics/n_actions.py
@@ -202,6 +202,68 @@ class NonterminalActions:

    n_classdefdeco2 = n_classdef

+    def n_const_list(self, node):
+        """
+        Prettyprint a constant dict, list, or set.
+
+        node is: COLLECTION_START add_consts BUILD_CONST_{DICT,LIST,SET}.
+        For a dict, the last ADD_VALUE token's pattr is the tuple of keys.
+        Raises TypeError if the last token is not one of those BUILD ops.
+        """
+        p = self.prec
+
+        lastnodetype = node[2].kind
+        flat_elems = node[1]
+        is_dict = lastnodetype.endswith("DICT")
+
+        if lastnodetype.endswith("LIST"):
+            self.write("[")
+            endchar = "]"
+        elif lastnodetype.endswith("SET") or is_dict:
+            self.write("{")
+            endchar = "}"
+        else:
+            raise TypeError(
+                f"Internal Error: n_const_list expects dict, list, or set; got {lastnodetype}"
+            )
+
+        self.indent_more(INDENT_PER_LEVEL)
+        sep = ""
+        if is_dict:
+            keys = flat_elems[-1].pattr
+            assert isinstance(keys, tuple)
+            assert len(keys) == len(flat_elems) - 1
+            for i, elem in enumerate(flat_elems[:-1]):
+                assert elem.kind == "ADD_VALUE"
+                value = elem.pattr
+                if elem.linestart is not None:
+                    if elem.linestart != self.line_number:
+                        # Element is on a different line: break and indent.
+                        sep += "\n" + self.indent + INDENT_PER_LEVEL[:-1]
+                        self.line_number = elem.linestart
+                    elif sep != "":
+                        sep += " "
+                self.write(f"{sep} {repr(keys[i])}: {value}")
+                sep = ","
+        else:
+            for elem in flat_elems:
+                assert elem.kind == "ADD_VALUE"
+                value = elem.pattr
+                if elem.linestart is not None:
+                    if elem.linestart != self.line_number:
+                        # Element is on a different line: break and indent.
+                        sep += "\n" + self.indent + INDENT_PER_LEVEL[:-1]
+                        self.line_number = elem.linestart
+                    elif sep != "":
+                        sep += " "
+                self.write(sep, value)
+                sep = ","
+        self.write(endchar)
+        self.indent_less(INDENT_PER_LEVEL)
+
+        self.prec = p
+        self.prune()
+
    def n_delete_subscript(self, node):
        if node[-2][0] == "build_list" and node[-2][0][-1].kind.startswith(
            "BUILD_TUPLE"
@@ -498,6 +560,11 @@ class NonterminalActions:
        """
        prettyprint a dict, list, set or tuple.
        """
+        if len(node) == 1 and node[0] == "const_list":
+            self.preorder(node[0])
+            self.prune()
+            return
+
        p = self.prec
        self.prec = PRECEDENCE["yield"] - 1
        lastnode = node.pop()
@@ -547,7 +614,6 @@ class NonterminalActions:
            self.write("(")
            endchar = ")"
        else:
-            # from trepan.api import debug; debug()
            raise TypeError(
                "Internal Error: n_build_list expects list, tuple, set, or unpack"
            )