WIP - extend fast long-literals into older Python3

rocky
2022-04-25 07:32:24 -04:00
parent c6642f5899
commit bf58fb9cf2
4 changed files with 231 additions and 104 deletions
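
The "fast long-literals" machinery recognizes a run of constant loads feeding a collection-build opcode and collapses it into pseudo-tokens the grammar can match cheaply. A minimal way to see the bytecode shape being targeted, using only the standard library (the output shape shown is from CPython 3.7; other versions may differ):

    import dis

    # A list display of five or more constants compiles to a run of
    # LOAD_CONST instructions followed by a single BUILD_LIST:
    dis.dis(compile("x = [1, 2, 3, 4, 5]", "<example>", "exec"))
    # Expected shape:
    #   LOAD_CONST 1 ... LOAD_CONST 5    (five constant loads)
    #   BUILD_LIST 5                     (build the list from those loads)
    #   STORE_NAME x

The scanner changes below rewrite exactly this pattern into COLLECTION_START / ADD_VALUE ... / BUILD_CONST_LIST tokens, which the parser changes then consume.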


@@ -814,6 +814,22 @@ class Python3Parser(PythonParser):
                 rule = "starred ::= %s %s" % ("expr " * v, opname)
                 self.addRule(rule, nop_func)
+            elif opname in ("BUILD_CONST_LIST", "BUILD_CONST_DICT", "BUILD_CONST_SET"):
+                if opname == "BUILD_CONST_DICT":
+                    rule = f"""
+                        add_consts ::= ADD_VALUE*
+                        const_list ::= COLLECTION_START add_consts {opname}
+                        dict       ::= const_list
+                        expr       ::= dict
+                        """
+                else:
+                    rule = f"""
+                        add_consts ::= ADD_VALUE*
+                        const_list ::= COLLECTION_START add_consts {opname}
+                        expr       ::= const_list
+                        """
+                self.addRule(rule, nop_func)
             elif opname_base in (
                 "BUILD_LIST",
                 "BUILD_SET",


@@ -125,80 +125,6 @@ class Scanner(object):
             # FIXME: This weird Python2 behavior is not Python3
             self.resetTokenClass()

-    def bound_collection(
-        self, tokens: list, next_tokens: list, t: Token, i: int, collection_type: str
-    ):
-        count = t.attr
-        assert isinstance(count, int)
-        assert count <= i
-
-        if collection_type == "CONST_DICT":
-            # constant dictonaries work via BUILD_CONST_KEY_MAP and
-            # handle the values() like sets and lists.
-            # However the keys() are an LOAD_CONST of the keys.
-            # adjust offset to account for this
-            count += 1
-
-        # For small lists don't bother
-        if count < 5:
-            return next_tokens + [t]
-
-        collection_start = i - count
-
-        for j in range(collection_start, i):
-            if tokens[j].kind not in (
-                "LOAD_CONST",
-                "LOAD_FAST",
-                "LOAD_GLOBAL",
-                "LOAD_NAME",
-            ):
-                return next_tokens + [t]
-
-        collection_enum = CONST_COLLECTIONS.index(collection_type)
-
-        # If we go there all instructions before tokens[i] are LOAD_CONST and we can replace
-        # add a boundary marker and change LOAD_CONST to something else
-        new_tokens = next_tokens[:-count]
-        start_offset = tokens[collection_start].offset
-        new_tokens.append(
-            Token(
-                opname="COLLECTION_START",
-                attr=collection_enum,
-                pattr=collection_type,
-                offset=f"{start_offset}_0",
-                has_arg=True,
-                opc=self.opc,
-                has_extended_arg=False,
-            )
-        )
-        for j in range(collection_start, i):
-            new_tokens.append(
-                Token(
-                    opname="ADD_VALUE",
-                    attr=tokens[j].attr,
-                    pattr=tokens[j].pattr,
-                    offset=tokens[j].offset,
-                    has_arg=True,
-                    linestart=tokens[j].linestart,
-                    opc=self.opc,
-                    has_extended_arg=False,
-                )
-            )
-        new_tokens.append(
-            Token(
-                opname=f"BUILD_{collection_type}",
-                attr=t.attr,
-                pattr=t.pattr,
-                offset=t.offset,
-                has_arg=t.has_arg,
-                linestart=t.linestart,
-                opc=t.opc,
-                has_extended_arg=False,
-            )
-        )
-        return new_tokens
-
     def build_instructions(self, co):
         """
         Create a list of instructions (a structured object rather than


@@ -35,16 +35,19 @@ Finally we save token information.

 from __future__ import print_function

-from xdis import iscode, instruction_size
+from typing import Tuple
+
+from xdis import iscode, instruction_size, Instruction
 from xdis.bytecode import _get_const_info

-from uncompyle6.scanner import Token, parse_fn_counts
+from uncompyle6.scanners.tok import Token
+from uncompyle6.scanner import parse_fn_counts
 import xdis

 # Get all the opcodes into globals
 import xdis.opcodes.opcode_33 as op3

-from uncompyle6.scanner import Scanner
+from uncompyle6.scanner import Scanner, CONST_COLLECTIONS

 import sys
@@ -204,17 +207,108 @@ class Scanner3(Scanner):
         #     self.varargs_ops = frozenset(self.opc.hasvargs)
         return

-    def ingest(self, co, classname=None, code_objects={}, show_asm=None):
+    def bound_collection_from_inst(
+        self, insts: list, next_tokens: list, inst: Instruction, i: int, collection_type: str
+    ) -> list:
+        t = Token(
+            opname=inst.opname,
+            attr=inst.argval,
+            pattr=inst.argrepr,
+            offset=inst.offset,
+            linestart=inst.starts_line,
+            op=inst.opcode,
+            has_arg=inst.has_arg,
+            has_extended_arg=inst.has_extended_arg,
+            opc=self.opc,
+        )
+        count = t.attr
+        assert isinstance(count, int)
+        assert count <= i
+
+        if collection_type == "CONST_DICT":
+            # Constant dictionaries work via BUILD_CONST_KEY_MAP and
+            # handle the values() like sets and lists.
+            # However the keys() are a LOAD_CONST of the keys.
+            # Adjust the count to account for this.
+            count += 1
+
+        # For small lists don't bother
+        if count < 5:
+            return next_tokens + [t]
+
+        collection_start = i - count
+
+        for j in range(collection_start, i):
+            if insts[j].opname not in (
+                "LOAD_CONST",
+                "LOAD_FAST",
+                "LOAD_GLOBAL",
+                "LOAD_NAME",
+            ):
+                return next_tokens + [t]
+
+        collection_enum = CONST_COLLECTIONS.index(collection_type)
+
+        # If we get here, all instructions before insts[i] are constant loads:
+        # add a boundary marker and turn each load into an ADD_VALUE token.
+        new_tokens = next_tokens[:-count]
+        start_offset = insts[collection_start].offset
+        new_tokens.append(
+            Token(
+                opname="COLLECTION_START",
+                attr=collection_enum,
+                pattr=collection_type,
+                offset=f"{start_offset}_0",
+                linestart=False,
+                has_arg=True,
+                has_extended_arg=False,
+                opc=self.opc,
+            )
+        )
+        for j in range(collection_start, i):
+            new_tokens.append(
+                Token(
+                    opname="ADD_VALUE",
+                    attr=insts[j].argval,
+                    pattr=insts[j].argrepr,
+                    offset=insts[j].offset,
+                    linestart=insts[j].starts_line,
+                    has_arg=True,
+                    has_extended_arg=False,
+                    opc=self.opc,
+                )
+            )
+        new_tokens.append(
+            Token(
+                opname=f"BUILD_{collection_type}",
+                attr=t.attr,
+                pattr=t.pattr,
+                offset=t.offset,
+                linestart=t.linestart,
+                has_arg=t.has_arg,
+                has_extended_arg=False,
+                opc=t.opc,
+            )
+        )
+        return new_tokens
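
In effect, this turns the tail of the token stream [LOAD_CONST 1, ..., LOAD_CONST 5, BUILD_LIST 5] into [COLLECTION_START, ADD_VALUE 1, ..., ADD_VALUE 5, BUILD_CONST_LIST]. A self-contained sketch of that rewrite (Inst and the (opname, value) pairs are simplified stand-ins invented for illustration; the real method works on xdis Instructions and full uncompyle6 Tokens):

    from collections import namedtuple

    # Stand-in for an xdis Instruction (illustration only).
    Inst = namedtuple("Inst", "opname argval offset")

    CONST_COLLECTIONS = ("CONST_LIST", "CONST_SET", "CONST_DICT")

    def bound_collection_sketch(insts, i, collection_type):
        # insts[i] is the BUILD_xxx instruction; argval is the element count.
        count = insts[i].argval
        if count < 5:                # small literals are not worth rewriting
            return None
        start = i - count
        if any(insts[j].opname != "LOAD_CONST" for j in range(start, i)):
            return None              # a non-constant element: leave untouched
        new = [("COLLECTION_START", CONST_COLLECTIONS.index(collection_type))]
        new += [("ADD_VALUE", insts[j].argval) for j in range(start, i)]
        new.append(("BUILD_" + collection_type, count))
        return new

    insts = [Inst("LOAD_CONST", v, o * 2) for o, v in enumerate((1, 2, 3, 4, 5))]
    insts.append(Inst("BUILD_LIST", 5, 10))
    print(bound_collection_sketch(insts, 5, "CONST_LIST"))
    # [('COLLECTION_START', 0), ('ADD_VALUE', 1), ('ADD_VALUE', 2),
    #  ('ADD_VALUE', 3), ('ADD_VALUE', 4), ('ADD_VALUE', 5),
    #  ('BUILD_CONST_LIST', 5)]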
+    def ingest(
+        self, co, classname=None, code_objects={}, show_asm=None
+    ) -> Tuple[list, dict]:
         """
-        Pick out tokens from an uncompyle6 code object, and transform them,
+        Create "tokens", mostly the opcode names from the bytecode of a
+        Python code object, modified in some cases to make parsing easier,
         returning a list of uncompyle6 Token's.

-        The transformations are made to assist the deparsing grammar.
-        Specificially:
+        Some transformations are made to assist the deparsing grammar:
            -  various types of LOAD_CONST's are categorized in terms of what they load
            -  COME_FROM instructions are added to assist parsing control structures
-           -  MAKE_FUNCTION and FUNCTION_CALLS append the number of positional arguments
-           -  some EXTENDED_ARGS instructions are removed
+           -  operands with stack argument counts or flag masks are appended to the
+              opcode name, e.g.:
+              *  BUILD_LIST, BUILD_SET
+              *  MAKE_FUNCTION and FUNCTION_CALLS append the number of positional arguments
+           -  EXTENDED_ARGS instructions are removed

         Also, when we encounter certain tokens, we add them to a set which will cause custom
         grammar rules. Specifically, variable arg tokens like MAKE_FUNCTION or BUILD_LIST
@@ -231,9 +325,6 @@ class Scanner3(Scanner):
             for instr in bytecode.get_instructions(co):
                 print(instr.disassemble())

-        # list of tokens/instructions
-        tokens = []
-
         # "customize" is in the process of going away here
         customize = {}
@@ -248,6 +339,7 @@ class Scanner3(Scanner):
         n = len(self.insts)
         for i, inst in enumerate(self.insts):
+            opname = inst.opname

             # We need to detect the difference between:
             #     raise AssertionError
             # and
@@ -258,7 +350,7 @@ class Scanner3(Scanner):
             if self.version[:2] == (3, 0):
                 # Like 2.6, 3.0 doesn't have POP_JUMP_IF... so we have
                 # to go through more machinations
-                assert_can_follow = inst.opname == "POP_TOP" and i + 1 < n
+                assert_can_follow = opname == "POP_TOP" and i + 1 < n
                 if assert_can_follow:
                     prev_inst = self.insts[i - 1]
                     assert_can_follow = (
@@ -267,7 +359,7 @@ class Scanner3(Scanner):
                     jump_if_inst = prev_inst
             else:
                 assert_can_follow = (
-                    inst.opname in ("POP_JUMP_IF_TRUE", "POP_JUMP_IF_FALSE")
+                    opname in ("POP_JUMP_IF_TRUE", "POP_JUMP_IF_FALSE")
                     and i + 1 < n
                 )
                 jump_if_inst = inst
@@ -291,13 +383,32 @@ class Scanner3(Scanner):
         # print("XXX2", jump_targets)
         last_op_was_break = False

+        new_tokens = []
         for i, inst in enumerate(self.insts):
+            opname = inst.opname
+
+            # Things that smash new_tokens, like BUILD_LIST, have to come first.
+            if opname in (
+                "BUILD_CONST_KEY_MAP",
+                "BUILD_LIST",
+                "BUILD_SET",
+            ):
+                collection_type = (
+                    "DICT"
+                    if opname.startswith("BUILD_CONST_KEY_MAP")
+                    else opname.split("_")[1]
+                )
+                new_tokens = self.bound_collection_from_inst(
+                    self.insts, new_tokens, inst, i, f"CONST_{collection_type}"
+                )
+                continue
+
             argval = inst.argval
             op = inst.opcode

-            if inst.opname == "EXTENDED_ARG":
+            if opname == "EXTENDED_ARG":
                 # FIXME: The EXTENDED_ARG is used to signal annotation
                 # parameters
                 if i + 1 < n and self.insts[i + 1].opcode != self.opc.MAKE_FUNCTION:
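
Isolated from the surrounding loop, the collection_type computation above is just a name mapping; a quick check:

    for opname in ("BUILD_CONST_KEY_MAP", "BUILD_LIST", "BUILD_SET"):
        collection_type = (
            "DICT"
            if opname.startswith("BUILD_CONST_KEY_MAP")
            else opname.split("_")[1]
        )
        print(opname, "->", "CONST_" + collection_type)
    # BUILD_CONST_KEY_MAP -> CONST_DICT
    # BUILD_LIST -> CONST_LIST
    # BUILD_SET -> CONST_SET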
@@ -324,7 +435,7 @@ class Scanner3(Scanner):
                     pass
                 elif inst.offset in self.except_targets:
                     come_from_name = "COME_FROM_EXCEPT_CLAUSE"
-                tokens.append(
+                new_tokens.append(
                     Token(
                         come_from_name,
                         jump_offset,
@@ -339,7 +450,7 @@ class Scanner3(Scanner):
                 pass
             elif inst.offset in self.else_start:
                 end_offset = self.else_start[inst.offset]
-                tokens.append(
+                new_tokens.append(
                     Token(
                         "ELSE",
                         None,
@@ -353,7 +464,6 @@ class Scanner3(Scanner):
                 pass

             pattr = inst.argrepr
-            opname = inst.opname

             if op in self.opc.CONST_OPS:
                 const = argval
@@ -422,7 +532,7 @@ class Scanner3(Scanner):
                     pass
                 opname = "%s_%d" % (opname, pos_args)
                 attr = (pos_args, name_pair_args, annotate_args)
-            tokens.append(
+            new_tokens.append(
                 Token(
                     opname=opname,
                     attr=attr,
@@ -508,12 +618,12 @@ class Scanner3(Scanner):
                     # the "continue" is not on a new line.
                     # There are other situations where we don't catch
                     # CONTINUE as well.
-                    if tokens[-1].kind == "JUMP_BACK" and tokens[-1].attr <= argval:
-                        if tokens[-2].kind == "BREAK_LOOP":
-                            del tokens[-1]
+                    if new_tokens[-1].kind == "JUMP_BACK" and new_tokens[-1].attr <= argval:
+                        if new_tokens[-2].kind == "BREAK_LOOP":
+                            del new_tokens[-1]
                         else:
                             # intern is used because we are changing the *previous* token
-                            tokens[-1].kind = intern("CONTINUE")
+                            new_tokens[-1].kind = intern("CONTINUE")
                 if last_op_was_break and opname == "CONTINUE":
                     last_op_was_break = False
                     continue
@@ -527,7 +637,7 @@ class Scanner3(Scanner):
                     opname = "LOAD_ASSERT"

             last_op_was_break = opname == "BREAK_LOOP"
-            tokens.append(
+            new_tokens.append(
                 Token(
                     opname=opname,
                     attr=argval,
@@ -542,10 +652,10 @@ class Scanner3(Scanner):
             pass

         if show_asm in ("both", "after"):
-            for t in tokens:
+            for t in new_tokens:
                 print(t.format(line_prefix=""))
             print()

-        return tokens, customize
+        return new_tokens, customize

     def find_jump_targets(self, debug):
         """


@@ -23,6 +23,9 @@ scanner routine for Python 3.
 """

 from typing import Tuple
+
+from uncompyle6.scanner import CONST_COLLECTIONS
+from uncompyle6.scanners.tok import Token
+
 from uncompyle6.scanners.scanner37base import Scanner37Base

 # bytecode verification, verify(), uses JUMP_OPs from here
@@ -31,9 +34,6 @@ from xdis.opcodes import opcode_37 as opc
 # bytecode verification, verify(), uses JUMP_OPS from here
 JUMP_OPs = opc.JUMP_OPS

-CONST_COLLECTIONS = ("CONST_LIST", "CONST_SET", "CONST_DICT")
-
 class Scanner37(Scanner37Base):
     def __init__(self, show_asm=None, is_pypy: bool=False):
         Scanner37Base.__init__(self, (3, 7), show_asm)
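
Moving CONST_COLLECTIONS from this module into uncompyle6.scanner lets Scanner3 and Scanner37 share one encoding; the tuple index is what COLLECTION_START carries in its attr field:

    CONST_COLLECTIONS = ("CONST_LIST", "CONST_SET", "CONST_DICT")
    assert CONST_COLLECTIONS.index("CONST_LIST") == 0
    assert CONST_COLLECTIONS.index("CONST_SET") == 1
    assert CONST_COLLECTIONS.index("CONST_DICT") == 2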
@@ -42,6 +42,81 @@ class Scanner37(Scanner37Base):
         pass

+    def bound_collection_from_tokens(
+        self, tokens: list, next_tokens: list, t: Token, i: int, collection_type: str
+    ) -> list:
+        count = t.attr
+        assert isinstance(count, int)
+        assert count <= i
+
+        if collection_type == "CONST_DICT":
+            # Constant dictionaries work via BUILD_CONST_KEY_MAP and
+            # handle the values() like sets and lists.
+            # However the keys() are a LOAD_CONST of the keys.
+            # Adjust the count to account for this.
+            count += 1
+
+        # For small lists don't bother
+        if count < 5:
+            return next_tokens + [t]
+
+        collection_start = i - count
+
+        for j in range(collection_start, i):
+            if tokens[j].kind not in (
+                "LOAD_CONST",
+                "LOAD_FAST",
+                "LOAD_GLOBAL",
+                "LOAD_NAME",
+            ):
+                return next_tokens + [t]
+
+        collection_enum = CONST_COLLECTIONS.index(collection_type)
+
+        # If we get here, all instructions before tokens[i] are constant loads:
+        # add a boundary marker and turn each load into an ADD_VALUE token.
+        new_tokens = next_tokens[:-count]
+        start_offset = tokens[collection_start].offset
+        new_tokens.append(
+            Token(
+                opname="COLLECTION_START",
+                attr=collection_enum,
+                pattr=collection_type,
+                offset=f"{start_offset}_0",
+                linestart=False,
+                has_arg=True,
+                has_extended_arg=False,
+                opc=self.opc,
+            )
+        )
+        for j in range(collection_start, i):
+            new_tokens.append(
+                Token(
+                    opname="ADD_VALUE",
+                    attr=tokens[j].attr,
+                    pattr=tokens[j].pattr,
+                    offset=tokens[j].offset,
+                    linestart=tokens[j].linestart,
+                    has_arg=True,
+                    has_extended_arg=False,
+                    opc=self.opc,
+                )
+            )
+        new_tokens.append(
+            Token(
+                opname=f"BUILD_{collection_type}",
+                attr=t.attr,
+                pattr=t.pattr,
+                offset=t.offset,
+                linestart=t.linestart,
+                has_arg=t.has_arg,
+                has_extended_arg=False,
+                opc=t.opc,
+            )
+        )
+        return new_tokens
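
The count += 1 adjustment for CONST_DICT above exists because BUILD_CONST_KEY_MAP loads all the keys as one extra constant tuple; this can be seen with the standard library (shape from CPython 3.6+; other versions compile dict displays differently):

    import dis

    dis.dis(compile("d = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}", "<ex>", "exec"))
    # Expected shape:
    #   LOAD_CONST 1 ... LOAD_CONST 5            (the five values)
    #   LOAD_CONST ('a', 'b', 'c', 'd', 'e')     (one extra load: the keys tuple)
    #   BUILD_CONST_KEY_MAP 5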
     def ingest(
         self, co, classname=None, code_objects={}, show_asm=None
     ) -> Tuple[list, dict]:
@@ -77,7 +152,7 @@ class Scanner37(Scanner37Base):
                     if t.kind.startswith("BUILD_CONST_KEY_MAP")
                     else t.kind.split("_")[1]
                 )
-                new_tokens = self.bound_collection(
+                new_tokens = self.bound_collection_from_tokens(
                     tokens, new_tokens, t, i, f"CONST_{collection_type}"
                 )
                 continue
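
A usage sketch tying the scanner pieces together, based only on the signatures shown in this diff (assumes running under CPython 3.7, which this scanner targets; the exact token dump is illustrative):

    from uncompyle6.scanners.scanner37 import Scanner37

    code = compile("x = [1, 2, 3, 4, 5]", "<example>", "exec")
    scanner = Scanner37()
    # show_asm="after" prints the transformed tokens, per ingest() above.
    tokens, customize = scanner.ingest(code, show_asm="after")
    # The dump should show COLLECTION_START / ADD_VALUE ... / BUILD_CONST_LIST
    # in place of the raw LOAD_CONST / BUILD_LIST run.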