handle long literal constants faster

2025-08-02 16:44:46 +08:00 · 2022-04-24 02:50:09 -04:00
parent 464801bcb3
commit 371138cfbc
10 changed files with 2061 additions and 63 deletions
--- a/uncompyle6/scanners/scanner37base.py
+++ b/uncompyle6/scanners/scanner37base.py
@@ -1,4 +1,4 @@
-#  Copyright (c) 2015-2020 by Rocky Bernstein
+#  Copyright (c) 2015-2020, 2022 by Rocky Bernstein
 #  Copyright (c) 2005 by Dan Pascu <dan@windowmaker.org>
 #  Copyright (c) 2000-2002 by hartmut Goebel <h.goebel@crazy-compilers.com>
 #
@@ -29,6 +29,8 @@ For example:
 Finally we save token information.
 """

+from typing import Any, Dict, List, Set
+
 from xdis import iscode, instruction_size, Instruction
 from xdis.bytecode import _get_const_info

@@ -45,6 +47,9 @@ import sys
 globals().update(op3.opmap)


+CONST_COLLECTIONS = ("CONST_LIST", "CONST_SET", "CONST_DICT")
+
+
 class Scanner37Base(Scanner):
    def __init__(self, version, show_asm=None, is_pypy=False):
        super(Scanner37Base, self).__init__(version, show_asm, is_pypy)
@@ -179,6 +184,80 @@ class Scanner37Base(Scanner):
        # self.varargs_ops = frozenset(self.opc.hasvargs)
        return

+    def bound_collection(
+        self, tokens: list, next_tokens: list, t: Token, i: int, collection_type: str
+    ):
+        count = t.attr
+        assert isinstance(count, int)
+
+        assert count <= i
+
+        if collection_type == "CONST_DICT":
+            # Constant dictionaries work via BUILD_CONST_KEY_MAP and
+            # handle the values() like sets and lists.
+            # However, the keys() are a LOAD_CONST of the keys.
+            # Adjust the count to account for this extra instruction.
+            count += 1
+
+        # For small lists don't bother
+        if count < 5:
+            return next_tokens + [t]
+
+        collection_start = i - count
+
+        for j in range(collection_start, i):
+            if tokens[j].kind not in (
+                "LOAD_CONST",
+                "LOAD_FAST",
+                "LOAD_GLOBAL",
+                "LOAD_NAME",
+            ):
+                return next_tokens + [t]
+
+        collection_enum = CONST_COLLECTIONS.index(collection_type)
+
+        # If we get here, all `count` tokens before tokens[i] are simple loads (LOAD_CONST,
+        # LOAD_FAST, ...), so add a boundary marker and turn each load into an ADD_VALUE.
+        new_tokens = next_tokens[:-count]
+        start_offset = tokens[collection_start].offset
+        new_tokens.append(
+            Token(
+                opname="COLLECTION_START",
+                attr=collection_enum,
+                pattr=collection_type,
+                offset=f"{start_offset}_0",
+                has_arg=True,
+                opc=self.opc,
+                has_extended_arg=False,
+            )
+        )
+        for j in range(collection_start, i):
+            new_tokens.append(
+                Token(
+                    opname="ADD_VALUE",
+                    attr=tokens[j].attr,
+                    pattr=tokens[j].pattr,
+                    offset=tokens[j].offset,
+                    has_arg=True,
+                    linestart=tokens[j].linestart,
+                    opc=self.opc,
+                    has_extended_arg=False,
+                )
+            )
+        new_tokens.append(
+            Token(
+                opname=f"BUILD_{collection_type}",
+                attr=t.attr,
+                pattr=t.pattr,
+                offset=t.offset,
+                has_arg=t.has_arg,
+                linestart=t.linestart,
+                opc=t.opc,
+                has_extended_arg=False,
+            )
+        )
+        return new_tokens
+
    def ingest(self, co, classname=None, code_objects={}, show_asm=None):
        """
        Pick out tokens from an uncompyle6 code object, and transform them,
@@ -212,7 +291,7 @@ class Scanner37Base(Scanner):
        # show_asm = 'both'
        if show_asm in ("both", "before"):
            for instr in bytecode.get_instructions(co):
-                print(instr.disassemble())
+                print(instr.disassemble(self.opc))

        # "customize" is in the process of going away here
        customize = {}
@@ -316,6 +395,7 @@ class Scanner37Base(Scanner):
                # "loop" tag last so the grammar rule matches that properly.
                for jump_offset in sorted(jump_targets[inst.offset], reverse=True):
                    come_from_name = "COME_FROM"
+
                    opname = self.opname_for_offset(jump_offset)
                    if opname == "EXTENDED_ARG":
                        k = xdis.next_offset(op, self.opc, jump_offset)
@@ -342,22 +422,6 @@ class Scanner37Base(Scanner):
                    jump_idx += 1
                    pass
                pass
-            elif inst.offset in self.else_start:
-                end_offset = self.else_start[inst.offset]
-                j = tokens_append(
-                    j,
-                    Token(
-                        "ELSE",
-                        None,
-                        repr(end_offset),
-                        offset="%s" % (inst.offset),
-                        has_arg=True,
-                        opc=self.opc,
-                        has_extended_arg=inst.has_extended_arg,
-                    ),
-                )
-
-                pass

            pattr = inst.argrepr
            opname = inst.opname
@@ -444,17 +508,24 @@ class Scanner37Base(Scanner):
                opname = "%s_%d+%d" % (opname, before_args, after_args)

            elif op == self.opc.JUMP_ABSOLUTE:
-                # Further classify JUMP_ABSOLUTE into backward jumps
-                # which are used in loops, and "CONTINUE" jumps which
-                # may appear in a "continue" statement.  The loop-type
-                # and continue-type jumps will help us classify loop
-                # boundaries The continue-type jumps help us get
-                # "continue" statements with would otherwise be turned
-                # into a "pass" statement because JUMPs are sometimes
-                # ignored in rules as just boundary overhead. In
-                # comprehensions we might sometimes classify JUMP_BACK
-                # as CONTINUE, but that's okay since we add a grammar
-                # rule for that.
+                # Refine JUMP_ABSOLUTE further into:
+                #
+                # * "JUMP_LOOP"    - jumps which are used in loops. This is sometimes
+                #                    found at the end of a looping construct.
+                # * "BREAK_LOOP"   - jumps which are used to break out of loops.
+                # * "CONTINUE"     - jumps which may appear in a "continue" statement.
+                #                    It is okay to confuse this with JUMP_LOOP. The
+                #                    grammar should tolerate this.
+                # * "JUMP_FORWARD" - forward jumps that are not BREAK_LOOP jumps.
+                #
+                # The loop-type and continue-type jumps will help us
+                # classify loop boundaries. The continue-type jumps
+                # help us get "continue" statements which would
+                # otherwise be turned into a "pass" statement because
+                # JUMPs are sometimes ignored in rules as just
+                # boundary overhead. Again, in comprehensions we might
+                # sometimes classify JUMP_LOOP as CONTINUE, but that's
+                # okay since grammar rules should tolerate that.
                pattr = argval
                target = inst.argval
                if target <= inst.offset:
@@ -523,7 +594,7 @@ class Scanner37Base(Scanner):
            print()
        return tokens, customize

-    def find_jump_targets(self, debug):
+    def find_jump_targets(self, debug: str) -> dict:
        """
        Detect all offsets in a byte code which are jump targets
        where we might insert a COME_FROM instruction.
@@ -538,18 +609,17 @@ class Scanner37Base(Scanner):
        self.structs = [{"type": "root", "start": 0, "end": n - 1}]

        # All loop entry points
-        self.loops = []
+        self.loops: List[int] = []

        # Map fixed jumps to their real destination
-        self.fixed_jumps = {}
+        self.fixed_jumps: Dict[int, int] = {}
        self.except_targets = {}
-        self.ignore_if = set()
+        self.ignore_if: Set[int] = set()
        self.build_statement_indices()
-        self.else_start = {}

        # Containers filled by detect_control_flow()
-        self.not_continue = set()
-        self.return_end_ifs = set()
+        self.not_continue: Set[int] = set()
+        self.return_end_ifs: Set[int] = set()
        self.setup_loop_targets = {}  # target given setup_loop offset
        self.setup_loops = {}  # setup_loop offset given target

@@ -655,9 +725,9 @@ class Scanner37Base(Scanner):
                ):
                    stmts.remove(stmt_offset)
                    continue
-                # Rewing ops till we encounter non-JUMP_ABSOLUTE one
+                # Scan back over bytecode ops until we hit a non-JUMP_ABSOLUTE op (or reach offset 0)
                j = self.prev_op[stmt_offset]
-                while code[j] == self.opc.JUMP_ABSOLUTE:
+                while code[j] == self.opc.JUMP_ABSOLUTE and j > 0:
                    j = self.prev_op[j]
                # If we got here, then it's list comprehension which
                # is not a statement too
@@ -687,7 +757,9 @@ class Scanner37Base(Scanner):
        # Finish filling the list for last statement
        slist += [codelen] * (codelen - len(slist))

-    def detect_control_flow(self, offset, targets, inst_index):
+    def detect_control_flow(
+        self, offset: int, targets: Dict[Any, Any], inst_index: int
+    ):
        """
        Detect type of block structures and their boundaries to fix optimized jumps
        in python2.3+
@@ -698,9 +770,9 @@ class Scanner37Base(Scanner):
        op = inst.opcode

        # Detect parent structure
-        parent = self.structs[0]
-        start = parent["start"]
-        end = parent["end"]
+        parent: Dict[str, Any] = self.structs[0]
+        start: int = parent["start"]
+        end: int = parent["end"]

        # Pick inner-most parent for our offset
        for struct in self.structs:
@@ -933,20 +1005,16 @@ class Scanner37Base(Scanner):


 if __name__ == "__main__":
-    from uncompyle6 import PYTHON_VERSION
+    from xdis.version_info import PYTHON_VERSION_TRIPLE, version_tuple_to_str

-    if PYTHON_VERSION >= 3.7:
+    if PYTHON_VERSION_TRIPLE[:2] == (3, 7):
        import inspect

-        co = inspect.currentframe().f_code
-        from uncompyle6 import PYTHON_VERSION
+        co = inspect.currentframe().f_code  # type: ignore

-        tokens, customize = Scanner37Base(PYTHON_VERSION).ingest(co)
+        tokens, customize = Scanner37Base(PYTHON_VERSION_TRIPLE).ingest(co)
        for t in tokens:
            print(t)
    else:
-        print(
-            "Need to be Python 3.7 or greater to demo; I am version {PYTHON_VERSION}."
-            % PYTHON_VERSION
-        )
+        print(f"Need to be Python 3.7 to demo; I am version {version_tuple_to_str()}.")
    pass