diff --git a/test/bytecode_3.3_run/05_long_literals.pyc b/test/bytecode_3.3_run/05_long_literals.pyc new file mode 100644 index 00000000..69eab14b Binary files /dev/null and b/test/bytecode_3.3_run/05_long_literals.pyc differ diff --git a/test/bytecode_3.6_run/05_long_literals.pyc b/test/bytecode_3.6_run/05_long_literals.pyc new file mode 100644 index 00000000..e73e3a0f Binary files /dev/null and b/test/bytecode_3.6_run/05_long_literals.pyc differ diff --git a/uncompyle6/parsers/parse3.py b/uncompyle6/parsers/parse3.py index 975a19c5..039a51db 100644 --- a/uncompyle6/parsers/parse3.py +++ b/uncompyle6/parsers/parse3.py @@ -814,6 +814,22 @@ class Python3Parser(PythonParser): rule = "starred ::= %s %s" % ("expr " * v, opname) self.addRule(rule, nop_func) + elif opname in ("BUILD_CONST_LIST", "BUILD_CONST_DICT", "BUILD_CONST_SET"): + if opname == "BUILD_CONST_DICT": + rule = f""" + add_consts ::= ADD_VALUE* + const_list ::= COLLECTION_START add_consts {opname} + dict ::= const_list + expr ::= dict + """ + else: + rule = f""" + add_consts ::= ADD_VALUE* + const_list ::= COLLECTION_START add_consts {opname} + expr ::= const_list + """ + self.addRule(rule, nop_func) + elif opname_base in ( "BUILD_LIST", "BUILD_SET", diff --git a/uncompyle6/scanner.py b/uncompyle6/scanner.py index 30dcab2d..87b6fc3b 100644 --- a/uncompyle6/scanner.py +++ b/uncompyle6/scanner.py @@ -125,80 +125,6 @@ class Scanner(object): # FIXME: This weird Python2 behavior is not Python3 self.resetTokenClass() - def bound_collection( - self, tokens: list, next_tokens: list, t: Token, i: int, collection_type: str - ): - count = t.attr - assert isinstance(count, int) - - assert count <= i - - if collection_type == "CONST_DICT": - # constant dictonaries work via BUILD_CONST_KEY_MAP and - # handle the values() like sets and lists. - # However the keys() are an LOAD_CONST of the keys. - # adjust offset to account for this - count += 1 - - # For small lists don't bother - if count < 5: - return next_tokens + [t] - - collection_start = i - count - - for j in range(collection_start, i): - if tokens[j].kind not in ( - "LOAD_CONST", - "LOAD_FAST", - "LOAD_GLOBAL", - "LOAD_NAME", - ): - return next_tokens + [t] - - collection_enum = CONST_COLLECTIONS.index(collection_type) - - # If we go there all instructions before tokens[i] are LOAD_CONST and we can replace - # add a boundary marker and change LOAD_CONST to something else - new_tokens = next_tokens[:-count] - start_offset = tokens[collection_start].offset - new_tokens.append( - Token( - opname="COLLECTION_START", - attr=collection_enum, - pattr=collection_type, - offset=f"{start_offset}_0", - has_arg=True, - opc=self.opc, - has_extended_arg=False, - ) - ) - for j in range(collection_start, i): - new_tokens.append( - Token( - opname="ADD_VALUE", - attr=tokens[j].attr, - pattr=tokens[j].pattr, - offset=tokens[j].offset, - has_arg=True, - linestart=tokens[j].linestart, - opc=self.opc, - has_extended_arg=False, - ) - ) - new_tokens.append( - Token( - opname=f"BUILD_{collection_type}", - attr=t.attr, - pattr=t.pattr, - offset=t.offset, - has_arg=t.has_arg, - linestart=t.linestart, - opc=t.opc, - has_extended_arg=False, - ) - ) - return new_tokens - def build_instructions(self, co): """ Create a list of instructions (a structured object rather than diff --git a/uncompyle6/scanners/scanner3.py b/uncompyle6/scanners/scanner3.py index 1e7396e5..f20803a2 100644 --- a/uncompyle6/scanners/scanner3.py +++ b/uncompyle6/scanners/scanner3.py @@ -35,16 +35,19 @@ Finally we save token information. 
 from __future__ import print_function

-from xdis import iscode, instruction_size
+from typing import Optional, Tuple
+
+from xdis import iscode, instruction_size, Instruction
 from xdis.bytecode import _get_const_info

-from uncompyle6.scanner import Token, parse_fn_counts
+from uncompyle6.scanners.tok import Token
+from uncompyle6.scanner import parse_fn_counts
 import xdis

 # Get all the opcodes into globals
 import xdis.opcodes.opcode_33 as op3

-from uncompyle6.scanner import Scanner
+from uncompyle6.scanner import Scanner, CONST_COLLECTIONS

 import sys

@@ -204,17 +207,99 @@ class Scanner3(Scanner):
         # self.varargs_ops = frozenset(self.opc.hasvargs)
         return

-    def ingest(self, co, classname=None, code_objects={}, show_asm=None):
+    def bound_collection_from_inst(
+        self, insts: list, next_tokens: list, inst: Instruction, t: Token, i: int, collection_type: str
+    ) -> Optional[list]:
+        count = t.attr
+        assert isinstance(count, int)
+
+        assert count <= i
+
+        if collection_type == "CONST_DICT":
+            # constant dictionaries work via BUILD_CONST_KEY_MAP and
+            # handle the values() like sets and lists.
+            # However the keys() are a LOAD_CONST of the keys.
+            # adjust offset to account for this
+            count += 1
+
+        # For small lists don't bother
+        if count < 5:
+            return None
+
+        collection_start = i - count
+
+        for j in range(collection_start, i):
+            if insts[j].opname not in (
+                "LOAD_ASSERT",
+                "LOAD_CODE",
+                "LOAD_CONST",
+                "LOAD_FAST",
+                "LOAD_GLOBAL",
+                "LOAD_NAME",
+                "LOAD_STR",
+            ):
+                return None
+
+        collection_enum = CONST_COLLECTIONS.index(collection_type)
+
+        # If we get here, all instructions before insts[i] are constant loads, so
+        # we can add a boundary marker and change LOAD_CONST to something else.
+        new_tokens = next_tokens[:-count]
+        start_offset = insts[collection_start].offset
+        new_tokens.append(
+            Token(
+                opname="COLLECTION_START",
+                attr=collection_enum,
+                pattr=collection_type,
+                offset=f"{start_offset}_0",
+                linestart=False,
+                has_arg=True,
+                has_extended_arg=False,
+                opc=self.opc,
+            )
+        )
+        for j in range(collection_start, i):
+            new_tokens.append(
+                Token(
+                    opname="ADD_VALUE",
+                    attr=insts[j].argval,
+                    pattr=insts[j].argrepr,
+                    offset=insts[j].offset,
+                    linestart=insts[j].starts_line,
+                    has_arg=True,
+                    has_extended_arg=False,
+                    opc=self.opc,
+                )
+            )
+        new_tokens.append(
+            Token(
+                opname=f"BUILD_{collection_type}",
+                attr=t.attr,
+                pattr=t.pattr,
+                offset=t.offset,
+                linestart=t.linestart,
+                has_arg=t.has_arg,
+                has_extended_arg=False,
+                opc=t.opc,
+            )
+        )
+        return new_tokens
+
+    def ingest(self, co, classname=None, code_objects={}, show_asm=None
+    ) -> Tuple[list, dict]:
         """
-        Pick out tokens from an uncompyle6 code object, and transform them,
+        Create "tokens" from the bytecode of a Python code object (largely
+        the opcode names, though some names are modified to make parsing
+        easier),
         returning a list of uncompyle6 Token's.

-        The transformations are made to assist the deparsing grammar.
- Specificially: + Some transformations are made to assist the deparsing grammar: - various types of LOAD_CONST's are categorized in terms of what they load - COME_FROM instructions are added to assist parsing control structures - - MAKE_FUNCTION and FUNCTION_CALLS append the number of positional arguments - - some EXTENDED_ARGS instructions are removed + - operands with stack argument counts or flag masks are appended to the opcode name, e.g.: + * BUILD_LIST, BUILD_SET + * MAKE_FUNCTION and FUNCTION_CALLS append the number of positional arguments + - EXTENDED_ARGS instructions are removed Also, when we encounter certain tokens, we add them to a set which will cause custom grammar rules. Specifically, variable arg tokens like MAKE_FUNCTION or BUILD_LIST @@ -231,9 +316,6 @@ class Scanner3(Scanner): for instr in bytecode.get_instructions(co): print(instr.disassemble()) - # list of tokens/instructions - tokens = [] - # "customize" is in the process of going away here customize = {} @@ -248,6 +330,7 @@ class Scanner3(Scanner): n = len(self.insts) for i, inst in enumerate(self.insts): + opname = inst.opname # We need to detect the difference between: # raise AssertionError # and @@ -258,7 +341,7 @@ class Scanner3(Scanner): if self.version[:2] == (3, 0): # Like 2.6, 3.0 doesn't have POP_JUMP_IF... so we have # to go through more machinations - assert_can_follow = inst.opname == "POP_TOP" and i + 1 < n + assert_can_follow = opname == "POP_TOP" and i + 1 < n if assert_can_follow: prev_inst = self.insts[i - 1] assert_can_follow = ( @@ -267,7 +350,7 @@ class Scanner3(Scanner): jump_if_inst = prev_inst else: assert_can_follow = ( - inst.opname in ("POP_JUMP_IF_TRUE", "POP_JUMP_IF_FALSE") + opname in ("POP_JUMP_IF_TRUE", "POP_JUMP_IF_FALSE") and i + 1 < n ) jump_if_inst = inst @@ -291,13 +374,48 @@ class Scanner3(Scanner): # print("XXX2", jump_targets) last_op_was_break = False + new_tokens = [] for i, inst in enumerate(self.insts): + opname = inst.opname + argval = inst.argval + pattr = inst.argrepr + + t = Token( + opname=opname, + attr=argval, + pattr=pattr, + offset=inst.offset, + linestart=inst.starts_line, + op=inst.opcode, + has_arg=inst.has_arg, + has_extended_arg=inst.has_extended_arg, + opc=self.opc, + ) + + # things that smash new_tokens like BUILD_LIST have to come first. + if opname in ( + "BUILD_CONST_KEY_MAP", + "BUILD_LIST", + "BUILD_SET", + ): + collection_type = ( + "DICT" + if opname.startswith("BUILD_CONST_KEY_MAP") + else opname.split("_")[1] + ) + try_tokens = self.bound_collection_from_inst( + self.insts, new_tokens, inst, t, i, f"CONST_{collection_type}" + ) + if try_tokens is not None: + new_tokens = try_tokens + continue + argval = inst.argval op = inst.opcode - if inst.opname == "EXTENDED_ARG": + if opname == "EXTENDED_ARG": # FIXME: The EXTENDED_ARG is used to signal annotation # parameters if i + 1 < n and self.insts[i + 1].opcode != self.opc.MAKE_FUNCTION: @@ -313,18 +431,18 @@ class Scanner3(Scanner): # "loop" tag last so the grammar rule matches that properly. 
for jump_offset in sorted(jump_targets[inst.offset], reverse=True): come_from_name = "COME_FROM" - opname = self.opname_for_offset(jump_offset) - if opname == "EXTENDED_ARG": + come_from_opname = self.opname_for_offset(jump_offset) + if come_from_opname == "EXTENDED_ARG": j = xdis.next_offset(op, self.opc, jump_offset) - opname = self.opname_for_offset(j) + come_from_opname = self.opname_for_offset(j) - if opname.startswith("SETUP_"): - come_from_type = opname[len("SETUP_") :] + if come_from_opname.startswith("SETUP_"): + come_from_type = come_from_opname[len("SETUP_") :] come_from_name = "COME_FROM_%s" % come_from_type pass elif inst.offset in self.except_targets: come_from_name = "COME_FROM_EXCEPT_CLAUSE" - tokens.append( + new_tokens.append( Token( come_from_name, jump_offset, @@ -339,7 +457,7 @@ class Scanner3(Scanner): pass elif inst.offset in self.else_start: end_offset = self.else_start[inst.offset] - tokens.append( + new_tokens.append( Token( "ELSE", None, @@ -352,9 +470,6 @@ class Scanner3(Scanner): pass - pattr = inst.argrepr - opname = inst.opname - if op in self.opc.CONST_OPS: const = argval if iscode(const): @@ -422,7 +537,7 @@ class Scanner3(Scanner): pass opname = "%s_%d" % (opname, pos_args) attr = (pos_args, name_pair_args, annotate_args) - tokens.append( + new_tokens.append( Token( opname=opname, attr=attr, @@ -508,12 +623,12 @@ class Scanner3(Scanner): # the "continue" is not on a new line. # There are other situations where we don't catch # CONTINUE as well. - if tokens[-1].kind == "JUMP_BACK" and tokens[-1].attr <= argval: - if tokens[-2].kind == "BREAK_LOOP": - del tokens[-1] + if new_tokens[-1].kind == "JUMP_BACK" and new_tokens[-1].attr <= argval: + if new_tokens[-2].kind == "BREAK_LOOP": + del new_tokens[-1] else: # intern is used because we are changing the *previous* token - tokens[-1].kind = intern("CONTINUE") + new_tokens[-1].kind = intern("CONTINUE") if last_op_was_break and opname == "CONTINUE": last_op_was_break = False continue @@ -527,25 +642,17 @@ class Scanner3(Scanner): opname = "LOAD_ASSERT" last_op_was_break = opname == "BREAK_LOOP" - tokens.append( - Token( - opname=opname, - attr=argval, - pattr=pattr, - offset=inst.offset, - linestart=inst.starts_line, - op=op, - has_arg=inst.has_arg, - opc=self.opc, - ) - ) + t.kind = opname + t.attr = argval + t.pattr = pattr + new_tokens.append(t) pass if show_asm in ("both", "after"): - for t in tokens: + for t in new_tokens: print(t.format(line_prefix="")) print() - return tokens, customize + return new_tokens, customize def find_jump_targets(self, debug): """ diff --git a/uncompyle6/scanners/scanner37.py b/uncompyle6/scanners/scanner37.py index 7ccfc91c..c93a822d 100644 --- a/uncompyle6/scanners/scanner37.py +++ b/uncompyle6/scanners/scanner37.py @@ -23,6 +23,9 @@ scanner routine for Python 3. 
""" from typing import Tuple + +from uncompyle6.scanner import CONST_COLLECTIONS +from uncompyle6.scanners.tok import Token from uncompyle6.scanners.scanner37base import Scanner37Base # bytecode verification, verify(), uses JUMP_OPs from here @@ -31,9 +34,6 @@ from xdis.opcodes import opcode_37 as opc # bytecode verification, verify(), uses JUMP_OPS from here JUMP_OPs = opc.JUMP_OPS -CONST_COLLECTIONS = ("CONST_LIST", "CONST_SET", "CONST_DICT") - - class Scanner37(Scanner37Base): def __init__(self, show_asm=None, is_pypy: bool=False): Scanner37Base.__init__(self, (3, 7), show_asm) @@ -42,6 +42,83 @@ class Scanner37(Scanner37Base): pass + def bound_collection_from_tokens( + self, tokens: list, next_tokens: list, t: Token, i: int, collection_type: str + ) -> list: + count = t.attr + assert isinstance(count, int) + + assert count <= i + + if collection_type == "CONST_DICT": + # constant dictonaries work via BUILD_CONST_KEY_MAP and + # handle the values() like sets and lists. + # However the keys() are an LOAD_CONST of the keys. + # adjust offset to account for this + count += 1 + + # For small lists don't bother + if count < 5: + return next_tokens + [t] + + collection_start = i - count + + for j in range(collection_start, i): + if tokens[j].kind not in ( + "LOAD_CODE", + "LOAD_CONST", + "LOAD_FAST", + "LOAD_GLOBAL", + "LOAD_NAME", + "LOAD_STR", + ): + return next_tokens + [t] + + collection_enum = CONST_COLLECTIONS.index(collection_type) + + # If we go there all instructions before tokens[i] are LOAD_CONST and we can replace + # add a boundary marker and change LOAD_CONST to something else + new_tokens = next_tokens[:-count] + start_offset = tokens[collection_start].offset + new_tokens.append( + Token( + opname="COLLECTION_START", + attr=collection_enum, + pattr=collection_type, + offset=f"{start_offset}_0", + linestart=False, + has_arg=True, + has_extended_arg=False, + opc=self.opc, + ) + ) + for j in range(collection_start, i): + new_tokens.append( + Token( + opname="ADD_VALUE", + attr=tokens[j].attr, + pattr=tokens[j].pattr, + offset=tokens[j].offset, + linestart=tokens[j].linestart, + has_arg=True, + has_extended_arg=False, + opc=self.opc, + ) + ) + new_tokens.append( + Token( + opname=f"BUILD_{collection_type}", + attr=t.attr, + pattr=t.pattr, + offset=t.offset, + linestart=t.linestart, + has_arg=t.has_arg, + has_extended_arg=False, + opc=t.opc, + ) + ) + return new_tokens + def ingest( self, co, classname=None, code_objects={}, show_asm=None ) -> Tuple[list, dict]: @@ -77,7 +154,7 @@ class Scanner37(Scanner37Base): if t.kind.startswith("BUILD_CONST_KEY_MAP") else t.kind.split("_")[1] ) - new_tokens = self.bound_collection( + new_tokens = self.bound_collection_from_tokens( tokens, new_tokens, t, i, f"CONST_{collection_type}" ) continue diff --git a/uncompyle6/semantics/n_actions.py b/uncompyle6/semantics/n_actions.py index d108b742..30cb0fe0 100644 --- a/uncompyle6/semantics/n_actions.py +++ b/uncompyle6/semantics/n_actions.py @@ -227,7 +227,7 @@ class NonterminalActions: self.indent_more(INDENT_PER_LEVEL) sep = "" if is_dict: - keys = flat_elems[-1].pattr + keys = flat_elems[-1].attr assert isinstance(keys, tuple) assert len(keys) == len(flat_elems) - 1 for i, elem in enumerate(flat_elems[:-1]): @@ -721,8 +721,8 @@ class NonterminalActions: def n_import_from(self, node): relative_path_index = 0 if self.version >= (2, 5): - if node[relative_path_index].pattr > 0: - node[2].pattr = ("." 
+        if node[relative_path_index].attr > 0:
+            node[2].pattr = ("." * node[relative_path_index].attr) + node[2].pattr
         if self.version > (2, 7):
             if isinstance(node[1].pattr, tuple):
                 imports = node[1].pattr
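
Reviewer note, not part of the patch: the instruction pattern this change targets is easy to reproduce with the standard-library dis module. Under CPython 3.7/3.8, a list literal whose elements are all constants compiles to one LOAD_CONST per element followed by a single BUILD_LIST, and runs of five or more such loads (the count < 5 early return above) are what bound_collection_from_inst / bound_collection_from_tokens fold into pseudo-tokens. A minimal sketch, assuming a CPython 3.7 or 3.8 interpreter:

    import dis

    # Five constant elements reach the threshold (count < 5 bails out early),
    # so this literal would be folded by the new scanner pass.
    code = compile("x = [1, 2, 3, 4, 5]", "<example>", "exec")
    dis.dis(code)  # prints five LOAD_CONST instructions, then BUILD_LIST 5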
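On a run like that, the scanner emits a boundary marker plus one ADD_VALUE per element in place of the constant loads; illustratively (a sketch of the shape, not literal uncompyle6 token output):

    COLLECTION_START CONST_LIST
    ADD_VALUE 1
    ADD_VALUE 2
    ADD_VALUE 3
    ADD_VALUE 4
    ADD_VALUE 5
    BUILD_CONST_LIST 5

The new parse3.py rules (add_consts ::= ADD_VALUE*, const_list ::= COLLECTION_START add_consts BUILD_CONST_LIST) then reduce the whole run in one pass instead of treating each element as a general expr, which presumably is what keeps very long constant literals from bloating the parse.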