Handle BUILD_{LIST,SET} more efficiently

2025-08-04 09:22:40 +08:00 · 2022-04-27 09:23:38 -04:00
parent 5220aa3b65
commit fa62724f14
7 changed files with 79 additions and 199 deletions
--- a/test/bytecode_2.7_run/05_long_literals.pyc
+++ b/test/bytecode_2.7_run/05_long_literals.pyc
--- a/uncompyle6/parsers/parse2.py
+++ b/uncompyle6/parsers/parse2.py
@@ -310,6 +310,14 @@ class Python2Parser(PythonParser):

            opname_base = opname[: opname.rfind("_")]

+            if opname in ("BUILD_CONST_LIST", "BUILD_CONST_SET"):
+                rule = """
+                       add_consts          ::= ADD_VALUE*
+                       const_list          ::= COLLECTION_START add_consts %s
+                       expr                ::= const_list
+                       """ % opname
+                self.addRule(rule, nop_func)
+
            # The order of opname listed is roughly sorted below
            if opname_base in ("BUILD_LIST", "BUILD_SET", "BUILD_TUPLE"):
                # We do this complicated test to speed up parsing of
--- a/uncompyle6/parsers/parse3.py
+++ b/uncompyle6/parsers/parse3.py
@@ -750,18 +750,37 @@ class Python3Parser(PythonParser):
                kvlist_n = "expr " * (token.attr)
                rule = "dict ::= %sLOAD_CONST %s" % (kvlist_n, opname)
                self.addRule(rule, nop_func)
+
+            elif opname in ("BUILD_CONST_LIST", "BUILD_CONST_DICT", "BUILD_CONST_SET"):
+                if opname == "BUILD_CONST_DICT":
+                    rule = """
+                           add_consts          ::= ADD_VALUE*
+                           const_list          ::= COLLECTION_START add_consts %s
+                           dict                ::= const_list
+                           expr                ::= dict
+                           """ % opname
+                else:
+                    rule = """
+                           add_consts          ::= ADD_VALUE*
+                           const_list          ::= COLLECTION_START add_consts %s
+                           expr                ::= const_list
+                           """ % opname
+                self.addRule(rule, nop_func)
+
            elif opname.startswith("BUILD_DICT_OLDER"):
                rule = """dict ::= COLLECTION_START key_value_pairs BUILD_DICT_OLDER
                          key_value_pairs ::= key_value_pair+
                          key_value_pair  ::= ADD_KEY ADD_VALUE
                       """
                self.addRule(rule, nop_func)
+
            elif opname.startswith("BUILD_LIST_UNPACK"):
                v = token.attr
                rule = "build_list_unpack ::= %s%s" % ("expr " * v, opname)
                self.addRule(rule, nop_func)
                rule = "expr ::= build_list_unpack"
                self.addRule(rule, nop_func)
+
            elif opname_base in ("BUILD_MAP", "BUILD_MAP_UNPACK"):
                kvlist_n = "kvlist_%s" % token.attr
                if opname == "BUILD_MAP_n":
@@ -822,22 +841,6 @@ class Python3Parser(PythonParser):
                rule = "starred ::= %s %s" % ("expr " * v, opname)
                self.addRule(rule, nop_func)

-            elif opname in ("BUILD_CONST_LIST", "BUILD_CONST_DICT", "BUILD_CONST_SET"):
-                if opname == "BUILD_CONST_DICT":
-                    rule = """
-                           add_consts          ::= ADD_VALUE*
-                           const_list          ::= COLLECTION_START add_consts %s
-                           dict                ::= const_list
-                           expr                ::= dict
-                           """ % opname
-                else:
-                    rule = """
-                           add_consts          ::= ADD_VALUE*
-                           const_list          ::= COLLECTION_START add_consts %s
-                           expr                ::= const_list
-                           """ % opname
-                self.addRule(rule, nop_func)
-
            elif opname_base in (
                "BUILD_LIST",
                "BUILD_SET",
--- a/uncompyle6/scanner.py
+++ b/uncompyle6/scanner.py
@@ -128,8 +128,8 @@ class Scanner(object):
        # FIXME: This weird Python2 behavior is not Python3
        self.resetTokenClass()

-    def bound_collection(
-        self, tokens, next_tokens, t, i, collection_type
+    def bound_collection_from_tokens(
+        self, tokens, t, i, collection_type
    ):
        count = t.attr
        assert isinstance(count, int)
@@ -145,7 +145,7 @@ class Scanner(object):

        # For small lists don't bother
        if count < 5:
-            return next_tokens + [t]
+            return None

        collection_start = i - count

@@ -156,13 +156,13 @@ class Scanner(object):
                "LOAD_GLOBAL",
                "LOAD_NAME",
            ):
-                return next_tokens + [t]
+                return None

        collection_enum = CONST_COLLECTIONS.index(collection_type)

        # If we go there all instructions before tokens[i] are LOAD_CONST and we can replace
        # add a boundary marker and change LOAD_CONST to something else
-        new_tokens = next_tokens[:-count]
+        new_tokens = tokens[:-count]
        start_offset = tokens[collection_start].offset
        new_tokens.append(
            Token(
--- a/uncompyle6/scanners/scanner2.py
+++ b/uncompyle6/scanners/scanner2.py
@@ -134,96 +134,6 @@ class Scanner2(Scanner):
            ]
        )

-    def bound_collection_from_tokens(
-        self, tokens, t, i, c, collection_type):
-        """
-        Try to a replace sequence of instruction that ends with a BUILD_LIST with a sequence that can
-        be parsed much faster, but inserting the token boundary at the beginning of the sequence.
-        """
-        count = t.attr
-        assert isinstance(count, int)
-        if count > i:
-            return None
-
-        # For small lists don't bother
-        if count < 5:
-            return None
-
-        collection_start = i - (count * 2)
-        assert (count * 2) <= i
-
-        for j in range(collection_start, i, 2):
-            try:
-                tokens[j]
-            except:
-                from trepan.api import debug; debug()
-            if tokens[j].opname not in (
-                "LOAD_CONST",
-            ):
-                return None
-            if tokens[j+1].opname not in (
-                "LOAD_CONST",
-            ):
-                return None
-
-        collection_start = i - (2 * count)
-        collection_enum = CONST_COLLECTIONS.index("CONST_MAP")
-
-        # If we get here, all instructions before tokens[i] are LOAD_CONST and we can replace
-        # add a boundary marker and change LOAD_CONST to something else
-        new_tokens = tokens[:-(2*count)]
-        start_offset = tokens[collection_start].offset
-        new_tokens.append(
-            Token(
-                opname="COLLECTION_START",
-                attr=collection_enum,
-                pattr="CONST_MAP",
-                offset="%s_0" % start_offset,
-                linestart=False,
-                has_arg=True,
-                has_extended_arg=False,
-                opc=self.opc,
-            )
-        )
-        for j in range(collection_start, i, 2):
-            new_tokens.append(
-                Token(
-                    opname="ADD_KEY",
-                    attr=tokens[j].argval,
-                    pattr=tokens[j].argrepr,
-                    offset=tokens[j].offset,
-                    linestart=tokens[j].starts_line,
-                    has_arg=True,
-                    has_extended_arg=False,
-                    opc=self.opc,
-                )
-            )
-            new_tokens.append(
-                Token(
-                    opname="ADD_VALUE",
-                    attr=tokens[j+1].argval,
-                    pattr=tokens[j+1].argrepr,
-                    offset=tokens[j+1].offset,
-                    linestart=tokens[j+1].starts_line,
-                    has_arg=True,
-                    has_extended_arg=False,
-                    opc=self.opc,
-                )
-            )
-        new_tokens.append(
-            Token(
-                opname=collection_type,
-                attr=t.attr,
-                pattr=t.pattr,
-                offset=t.offset,
-                linestart=t.linestart,
-                has_arg=t.has_arg,
-                has_extended_arg=False,
-                opc=t.opc,
-            )
-        )
-        return new_tokens
-
    @staticmethod
    def extended_arg_val(arg):
        """Return integer value of an EXTENDED_ARG operand.
@@ -287,7 +197,6 @@ class Scanner2(Scanner):
        grammar rules. Specifically, variable arg tokens like MAKE_FUNCTION or BUILD_LIST
        cause specific rules for the specific number of arguments they take.
        """
-
        if not show_asm:
            show_asm = self.show_asm

@@ -400,9 +309,24 @@ class Scanner2(Scanner):
                if op == self.opc.EXTENDED_ARG:
                    extended_arg += self.extended_arg_val(oparg)
                    continue
-                ###
-                # Start here: look for BUILD_LIST
-                ###
+
+                # Note: we match on the opcode name rather than on op,
+                # since BUILD_SET isn't in earlier Pythons.
+                if op_name in (
+                    "BUILD_LIST",
+                    "BUILD_SET",
+                ):
+                    t = Token(
+                        op_name, oparg, pattr, offset, self.linestarts.get(offset, None), op, has_arg, self.opc
+                    )
+                    collection_type = op_name.split("_")[1]
+                    next_tokens = self.bound_collection_from_tokens(
+                        new_tokens, t, len(new_tokens), "CONST_%s" % collection_type
+                    )
+                    if next_tokens is not None:
+                        new_tokens = next_tokens
+                        continue
+
                if op in self.opc.CONST_OPS:
                    const = co.co_consts[oparg]
                    if iscode(const):
--- a/uncompyle6/scanners/scanner26.py
+++ b/uncompyle6/scanners/scanner26.py
@@ -121,7 +121,9 @@ class Scanner26(scan.Scanner2):
            i = self.next_stmt[i]

        extended_arg = 0
+        i = -1
        for offset in self.op_range(0, codelen):
+            i += 1
            op = self.code[offset]
            op_name = self.opname[op]
            oparg = None; pattr = None
@@ -154,8 +156,28 @@ class Scanner26(scan.Scanner2):
                oparg = self.get_argument(offset) + extended_arg
                extended_arg = 0
                if op == self.opc.EXTENDED_ARG:
-                    extended_arg = oparg * L65536
+                     extended_arg += self.extended_arg_val(oparg)
                     continue
+
+
+                # Note: we match on the opcode name rather than on op,
+                # since BUILD_SET isn't in earlier Pythons.
+                if op_name in (
+                    "BUILD_LIST",
+                    "BUILD_SET",
+                ):
+                    t = Token(
+                        op_name, oparg, pattr, offset, self.linestarts.get(offset, None), op, has_arg, self.opc
+                    )
+
+                    collection_type = op_name.split("_")[1]
+                    next_tokens = self.bound_collection_from_tokens(
+                        tokens, t, i, "CONST_%s" % collection_type
+                    )
+                    if next_tokens is not None:
+                        tokens = next_tokens
+                        continue
+
                if op in self.opc.CONST_OPS:
                    const = co.co_consts[oparg]
                    # We can't use inspect.iscode() because we may be
--- a/uncompyle6/scanners/scanner37.py
+++ b/uncompyle6/scanners/scanner37.py
@@ -22,8 +22,6 @@ This sets up opcodes Python's 3.7 and calls a generalized
 scanner routine for Python 3.
 """

-from uncompyle6.scanner import CONST_COLLECTIONS
-from uncompyle6.scanners.tok import Token
 from uncompyle6.scanners.scanner37base import Scanner37Base

 # bytecode verification, verify(), uses JUMP_OPs from here
@@ -40,83 +38,6 @@ class Scanner37(Scanner37Base):

    pass

-    def bound_collection_from_tokens(
-        self, tokens, next_tokens, t, i, collection_type
-    ):
-        count = t.attr
-        assert isinstance(count, int)
-
-        assert count <= i
-
-        if collection_type == "CONST_DICT":
-            # constant dictonaries work via BUILD_CONST_KEY_MAP and
-            # handle the values() like sets and lists.
-            # However the keys() are an LOAD_CONST of the keys.
-            # adjust offset to account for this
-            count += 1
-
-        # For small lists don't bother
-        if count < 5:
-            return next_tokens + [t]
-
-        collection_start = i - count
-
-        for j in range(collection_start, i):
-            if tokens[j].kind not in (
-                "LOAD_CODE",
-                "LOAD_CONST",
-                "LOAD_FAST",
-                "LOAD_GLOBAL",
-                "LOAD_NAME",
-                "LOAD_STR",
-            ):
-                return next_tokens + [t]
-
-        collection_enum = CONST_COLLECTIONS.index(collection_type)
-
-        # If we get here, all instructions before tokens[i] are LOAD_CONST and we can replace
-        # add a boundary marker and change LOAD_CONST to something else.
-        new_tokens = next_tokens[:-count]
-        start_offset = tokens[collection_start].offset
-        new_tokens.append(
-            Token(
-                opname="COLLECTION_START",
-                attr=collection_enum,
-                pattr=collection_type,
-                offset="%s_0" % start_offset,
-                linestart=False,
-                has_arg=True,
-                has_extended_arg=False,
-                opc=self.opc,
-            )
-        )
-        for j in range(collection_start, i):
-            new_tokens.append(
-                Token(
-                    opname="ADD_VALUE",
-                    attr=tokens[j].attr,
-                    pattr=tokens[j].pattr,
-                    offset=tokens[j].offset,
-                    linestart=tokens[j].linestart,
-                    has_arg=True,
-                    has_extended_arg=False,
-                    opc=self.opc,
-                )
-            )
-        new_tokens.append(
-            Token(
-                opname="BUILD_%s" % collection_type,
-                attr=t.attr,
-                pattr=t.pattr,
-                offset=t.offset,
-                linestart=t.linestart,
-                has_arg=t.has_arg,
-                has_extended_arg=False,
-                opc=t.opc,
-            )
-        )
-        return new_tokens
-
    def ingest(
        self, co, classname=None, code_objects={}, show_asm=None
    ):
@@ -151,9 +72,11 @@ class Scanner37(Scanner37Base):
                    collection_type = "DICT"
                else:
                    collection_type = t.kind.split("_")[1]
-                new_tokens = self.bound_collection(
-                    tokens, new_tokens, t, i, "CONST_%s" % collection_type
+                next_tokens = self.bound_collection_from_tokens(
+                    new_tokens, t, i, "CONST_%s" % collection_type
                )
+                if next_tokens is not None:
+                    new_tokens = next_tokens
                    continue

            # The lowest bit of flags indicates whether the