#  Copyright (c) 2015-2017, 2021-2022, 2024 by Rocky Bernstein
#  Copyright (c) 2005 by Dan Pascu
#  Copyright (c) 2000-2002 by hartmut Goebel
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
Python 2.6 bytecode scanner

This overlaps Python 2.6's dis module, but it can be run from Python 3 and
other versions of Python. Also, we save token information for later
use in deparsing.
"""

import sys

from xdis import iscode
from xdis.bytecode import _get_const_info
from xdis.opcodes import opcode_26

import uncompyle6.scanners.scanner2 as scan
from uncompyle6.scanner import Token

intern = sys.intern

# bytecode verification, verify(), uses JUMP_OPs from here
JUMP_OPS = opcode_26.JUMP_OPS


class Scanner26(scan.Scanner2):
    """Turn Python 2.6 bytecode into the token stream used for deparsing."""

    def __init__(self, show_asm=False):
        """Set up a scanner for version (2, 6).

        ``show_asm`` is handed to the base class unchanged; ``ingest()``
        falls back to it when it is not given its own ``show_asm`` value.
        """
        super(Scanner26, self).__init__((2, 6), show_asm)

        # "setup" opcodes
        self.setup_ops = frozenset(
            [
                self.opc.SETUP_EXCEPT,
                self.opc.SETUP_FINALLY,
            ]
        )

        return

    def ingest(self, co, classname=None, code_objects={}, show_asm=None):
        """Create "tokens" from the bytecode of a Python code object.

        Largely these are the opcode name, but in some cases that has been
        modified to make parsing easier.

        Some transformations are made to assist the deparsing grammar:
           -  various types of LOAD_CONST's are categorized in terms of what
              they load
           -  COME_FROM instructions are added to assist parsing control
              structures
           -  operands with stack argument counts or flag masks are appended
              to the opcode name, e.g.:
              *  BUILD_LIST, BUILD_SET
              *  MAKE_FUNCTION and FUNCTION_CALLS append the number of
                 positional arguments
           -  EXTENDED_ARGS instructions are removed

        Also, when we encounter certain tokens, we add them to a set
        which will cause custom grammar rules. Specifically, variable
        arg tokens like MAKE_FUNCTION or BUILD_LIST cause specific
        rules for the specific number of arguments they take.

        Parameters:
           co            the code object to tokenize.
           classname     passed on to ``unmangle_code_names()``.
           code_objects  not used by this method; it is never mutated, so
                         the shared default is harmless.
           show_asm      when falsy, ``self.show_asm`` is used instead.
                         "before" or "both" prints a disassembly;
                         "after" or "both" prints the resulting tokens.

        Returns a pair ``(tokens, customize)``: the list of uncompyle6
        Token's, and a dict keyed by token name that records which
        custom grammar rules are needed.
        """

        if not show_asm:
            show_asm = self.show_asm

        bytecode = self.build_instructions(co)

        # show_asm = 'after'
        if show_asm in ("both", "before"):
            print("\n# ---- disassembly:")
            bytecode.disassemble_bytes(
                co.co_code,
                varnames=co.co_varnames,
                names=co.co_names,
                constants=co.co_consts,
                cells=bytecode._cell_names,
                line_starts=bytecode._linestarts,
                asm_format="extended",
            )

        # Container for tokens
        tokens = []

        customize = {}
        if self.is_pypy:
            customize["PyPy"] = 0

        codelen = len(self.code)

        free, names, varnames = self.unmangle_code_names(co, classname)
        self.names = names

        # Scan for assertions. Later we will
        # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'.
        # 'LOAD_ASSERT' is used in assert statements.
        self.load_asserts = set()
        for i in self.op_range(0, codelen):
            # We need to detect the difference between:
            #   raise AssertionError
            #  and
            #   assert ...
            if (
                self.code[i] == self.opc.JUMP_IF_TRUE
                and i + 4 < codelen
                and self.code[i + 3] == self.opc.POP_TOP
                and self.code[i + 4] == self.opc.LOAD_GLOBAL
            ):
                if names[self.get_argument(i + 4)] == "AssertionError":
                    self.load_asserts.add(i + 4)

        jump_targets = self.find_jump_targets(show_asm)
        # contains (code, [addrRefToCode])

        # Walk statement starts to find PRINT_ITEM / PRINT_NEWLINE
        # instructions that continue a preceding print statement; those
        # offsets get a replacement token name below.
        last_stmt = self.next_stmt[0]
        i = self.next_stmt[last_stmt]
        replace = {}
        while i < codelen - 1:
            if self.lines and self.lines[last_stmt].next > i:
                # Distinguish "print ..." from "print ...,"
                if self.code[last_stmt] == self.opc.PRINT_ITEM:
                    if self.code[i] == self.opc.PRINT_ITEM:
                        replace[i] = "PRINT_ITEM_CONT"
                    elif self.code[i] == self.opc.PRINT_NEWLINE:
                        replace[i] = "PRINT_NEWLINE_CONT"
            last_stmt = i
            i = self.next_stmt[i]

        extended_arg = 0
        for offset in self.op_range(0, codelen):
            op = self.code[offset]
            op_name = self.opname[op]
            oparg = None
            pattr = None

            if offset in jump_targets:
                jump_idx = 0
                # We want to process COME_FROMs to the same offset to be in
                # *descending* offset order so we have the larger range or
                # biggest instruction interval last. (I think they are
                # sorted in increasing order, but for safety we sort them).
                # That way, specific COME_FROM tags will match up properly.
                # For example, a "loop" with an "if" nested in it should
                # have the "loop" tag last so the grammar rule matches that
                # properly.
                last_jump_offset = -1
                for jump_offset in sorted(jump_targets[offset], reverse=True):
                    # Skip duplicates of the same originating offset.
                    if jump_offset != last_jump_offset:
                        tokens.append(
                            Token(
                                "COME_FROM",
                                jump_offset,
                                repr(jump_offset),
                                offset="%s_%d" % (offset, jump_idx),
                                has_arg=True,
                            )
                        )
                        jump_idx += 1
                        last_jump_offset = jump_offset
            elif offset in self.thens:
                tokens.append(
                    Token(
                        "THEN",
                        None,
                        self.thens[offset],
                        offset="%s_0" % offset,
                        has_arg=True,
                    )
                )

            has_arg = op >= self.opc.HAVE_ARGUMENT
            if has_arg:
                oparg = self.get_argument(offset) + extended_arg
                extended_arg = 0
                if op == self.opc.EXTENDED_ARG:
                    # Fold the value into the next instruction's argument
                    # and emit no token for EXTENDED_ARG itself.
                    extended_arg += self.extended_arg_val(oparg)
                    continue

                # Note: name used to match on rather than op since
                # BUILD_SET isn't in earlier Pythons.
                if op_name in (
                    "BUILD_LIST",
                    "BUILD_SET",
                ):
                    t = Token(
                        op_name,
                        oparg,
                        pattr,
                        offset,
                        self.linestarts.get(offset, None),
                        op,
                        has_arg,
                        self.opc,
                    )

                    collection_type = op_name.split("_")[1]
                    next_tokens = self.bound_collection_from_tokens(
                        tokens, t, len(tokens), "CONST_%s" % collection_type
                    )
                    if next_tokens is not None:
                        tokens = next_tokens
                        continue

                if op in self.opc.CONST_OPS:
                    const = co.co_consts[oparg]
                    if iscode(const):
                        oparg = const
                        # Classify the code constant by the name the
                        # compiler gives the code object.
                        if const.co_name == "<lambda>":
                            assert op_name == "LOAD_CONST"
                            op_name = "LOAD_LAMBDA"
                        elif const.co_name == self.genexpr_name:
                            op_name = "LOAD_GENEXPR"
                        elif const.co_name == "<dictcomp>":
                            op_name = "LOAD_DICTCOMP"
                        elif const.co_name == "<setcomp>":
                            op_name = "LOAD_SETCOMP"
                        else:
                            op_name = "LOAD_CODE"
                        # verify() uses 'pattr' for comparison, since 'attr'
                        # now holds Code(const) and thus can not be used
                        # for comparison (todo: think about changing this)
                        # pattr = 'code_object @ 0x%x %s->%s' %\
                        # (id(const), const.co_filename, const.co_name)
                        pattr = "<code_object " + const.co_name + ">"
                    else:
                        if oparg < len(co.co_consts):
                            argval, _ = _get_const_info(oparg, co.co_consts)
                        # Why don't we use _ above for "pattr" rather than "const"?
                        # This *is* a little hoaky, but we have to coordinate with
                        # other parts like n_LOAD_CONST in pysource.py for example.
                        pattr = const
                        pass
                elif op in self.opc.NAME_OPS:
                    pattr = names[oparg]
                elif op in self.opc.JREL_OPS:
                    pattr = repr(offset + 3 + oparg)
                    if op == self.opc.JUMP_FORWARD:
                        target = self.get_target(offset)
                        # FIXME: this is a hack to catch stuff like:
                        #   if x: continue
                        # the "continue" is not on a new line.
                        if len(tokens) and tokens[-1].kind == "JUMP_BACK":
                            tokens[-1].kind = intern("CONTINUE")

                elif op in self.opc.JABS_OPS:
                    pattr = repr(oparg)
                elif op in self.opc.LOCAL_OPS:
                    if self.version < (1, 5):
                        pattr = names[oparg]
                    else:
                        pattr = varnames[oparg]
                elif op in self.opc.COMPARE_OPS:
                    pattr = self.opc.cmp_op[oparg]
                elif op in self.opc.FREE_OPS:
                    pattr = free[oparg]

            if op in self.varargs_ops:
                # CE - Hack for >= 2.5
                #      Now all values loaded via LOAD_CLOSURE are packed into
                #      a tuple before calling MAKE_CLOSURE.
                if (
                    self.version >= (2, 5)
                    and op == self.opc.BUILD_TUPLE
                    and self.code[self.prev[offset]] == self.opc.LOAD_CLOSURE
                ):
                    continue
                else:
                    op_name = "%s_%d" % (op_name, oparg)
                    customize[op_name] = oparg
            elif self.version > (2, 0) and op == self.opc.CONTINUE_LOOP:
                customize[op_name] = 0
            elif op_name in (
                "CONTINUE_LOOP",
                "EXEC_STMT",
                "LOAD_LISTCOMP",
                "LOAD_SETCOMP",
            ):
                customize[op_name] = 0
            elif op == self.opc.JUMP_ABSOLUTE:
                # Further classify JUMP_ABSOLUTE into backward jumps
                # which are used in loops, and "CONTINUE" jumps which
                # may appear in a "continue" statement.  The loop-type
                # and continue-type jumps will help us classify loop
                # boundaries The continue-type jumps help us get
                # "continue" statements with would otherwise be turned
                # into a "pass" statement because JUMPs are sometimes
                # ignored in rules as just boundary overhead. In
                # comprehensions we might sometimes classify JUMP_BACK
                # as CONTINUE, but that's okay since we add a grammar
                # rule for that.
                target = self.get_target(offset)
                if target <= offset:
                    op_name = "JUMP_BACK"
                    if offset in self.stmts and self.code[offset + 3] not in (
                        self.opc.END_FINALLY,
                        self.opc.POP_BLOCK,
                    ):
                        # "tokens" is checked for emptiness before looking
                        # at the previous token, as the JUMP_FORWARD case
                        # above does.
                        if (
                            offset in self.linestarts
                            and tokens
                            and tokens[-1].kind == "JUMP_BACK"
                        ) or offset not in self.not_continue:
                            op_name = "CONTINUE"
                    else:
                        # FIXME: this is a hack to catch stuff like:
                        #   if x: continue
                        # the "continue" is not on a new line.
                        if tokens and tokens[-1].kind == "JUMP_BACK":
                            # We need 'intern' since we have
                            # already have processed the previous
                            # token.
                            tokens[-1].kind = intern("CONTINUE")

            elif op == self.opc.LOAD_GLOBAL:
                if offset in self.load_asserts:
                    op_name = "LOAD_ASSERT"
            elif op == self.opc.RETURN_VALUE:
                if offset in self.return_end_ifs:
                    op_name = "RETURN_END_IF"

            linestart = self.linestarts.get(offset, None)

            if offset not in replace:
                tokens.append(
                    Token(
                        op_name, oparg, pattr, offset, linestart, op, has_arg, self.opc
                    )
                )
            else:
                # A print-continuation name recorded earlier overrides the
                # opcode-derived name.
                tokens.append(
                    Token(
                        replace[offset],
                        oparg,
                        pattr,
                        offset,
                        linestart,
                        op,
                        has_arg,
                        self.opc,
                    )
                )
                pass
            pass

        if show_asm in ("both", "after"):
            print("\n# ---- tokenization:")
            # FIXME: t.format() is changing tokens!
            for t in tokens.copy():
                print(t.format(line_prefix=""))
            print()
        return tokens, customize


if __name__ == "__main__":
    from xdis.version_info import PYTHON_VERSION_TRIPLE, version_tuple_to_str

    if PYTHON_VERSION_TRIPLE[:2] == (2, 6):
        import inspect

        co = inspect.currentframe().f_code  # type: ignore

        tokens, customize = Scanner26().ingest(co)
        for t in tokens:
            print(t.format())
        pass
    else:
        print("Need to be Python 2.6 to demo; I am version %s" % version_tuple_to_str())