diff --git a/test/bytecode_3.3/03_ifelse_in_lambda.pyc b/test/bytecode_3.3/03_ifelse_in_lambda.pyc new file mode 100644 index 00000000..7628206f Binary files /dev/null and b/test/bytecode_3.3/03_ifelse_in_lambda.pyc differ diff --git a/test/bytecode_3.3/03_map.pyc b/test/bytecode_3.3/03_map.pyc index 9712c1ea..ef181d74 100644 Binary files a/test/bytecode_3.3/03_map.pyc and b/test/bytecode_3.3/03_map.pyc differ diff --git a/test/bytecode_3.4/03_ifelse_in_lambda.pyc b/test/bytecode_3.4/03_ifelse_in_lambda.pyc new file mode 100644 index 00000000..01df985d Binary files /dev/null and b/test/bytecode_3.4/03_ifelse_in_lambda.pyc differ diff --git a/test/bytecode_3.4/03_map.pyc b/test/bytecode_3.4/03_map.pyc index 7834fb41..b7922a8b 100644 Binary files a/test/bytecode_3.4/03_map.pyc and b/test/bytecode_3.4/03_map.pyc differ diff --git a/test/bytecode_3.5/02_for_else_bug.pyc b/test/bytecode_3.5/02_for_else_bug.pyc new file mode 100644 index 00000000..ace42316 Binary files /dev/null and b/test/bytecode_3.5/02_for_else_bug.pyc differ diff --git a/test/bytecode_3.5/03_ifelse_in_lambda.pyc b/test/bytecode_3.5/03_ifelse_in_lambda.pyc new file mode 100644 index 00000000..33b8757f Binary files /dev/null and b/test/bytecode_3.5/03_ifelse_in_lambda.pyc differ diff --git a/test/bytecode_3.6/09_long_whilestmt.pyc b/test/bytecode_3.6/09_long_whilestmt.pyc new file mode 100644 index 00000000..7612eb31 Binary files /dev/null and b/test/bytecode_3.6/09_long_whilestmt.pyc differ diff --git a/test/simple_source/bug34/03_ifelse_in_lambda.py b/test/simple_source/bug34/03_ifelse_in_lambda.py new file mode 100644 index 00000000..b498a009 --- /dev/null +++ b/test/simple_source/bug34/03_ifelse_in_lambda.py @@ -0,0 +1,4 @@ +# Next line is 1164 +def foo(): + name = "bar" + lambda x: compile(x, "" % name, "exec") if x else None diff --git a/test/simple_source/bug35/02_for_else_bug.py b/test/simple_source/bug35/02_for_else_bug.py new file mode 100644 index 00000000..c8f85ad8 --- /dev/null +++ b/test/simple_source/bug35/02_for_else_bug.py @@ -0,0 +1,10 @@ +# Adapted 3.5 from _bootstrap_external.py + + +def spec_from_file_location(loader, location): + if loader: + for _ in __file__: + if location: + break + else: + return None diff --git a/test/simple_source/bug36/09_long_whilestmt.py b/test/simple_source/bug36/09_long_whilestmt.py new file mode 100644 index 00000000..1cf98b6b --- /dev/null +++ b/test/simple_source/bug36/09_long_whilestmt.py @@ -0,0 +1,74 @@ +# From https://github.com/rocky/python-uncompyle6/issues/420 +# Related to EXTENDED_ARG in whilestmt +ERRPR_CODE_DEFINE = {} # Remove this and things works + +try: + print() +except Exception: + var1 = 0 + var2 = 1 + if var1 or var2: + times = 1 + while times != False and self.scanner.is_open(): + try: + try: + print() + except Exception: + print() + + out = 0 + count = 1 + if out == 1: + break + elif out == 2: + count += 1 + if times == 3: + self.func.emit({}) + break + else: + continue + if out == 3 or out == b"": + if self.times == 3: + break + count += 1 + if count == 3: + count = 0 + if out == 4: + self.func.emit(ERRPR_CODE_DEFINE.ReceiedError()) + else: + print() + break + continue + else: + count = 0 + except Exception: + print("upper exception") + else: + try: + print("jump forward") + while True: + out = self.func.read(count) + if out == b"": + self.func.emit(ERRPR_CODE_DEFINE.ReceiedError()) + break + continue + imagedata = out[0] + if imagedata == b"\x05": + self.func.emit(INFORMATION.UnsupportedImage()) + break + continue + if imagedata == b"\x15": + self.func.emit(INFORMATION.NoneImage()) + break + continue + if out[1] == False: + start_index = imagedata.find(b"BM6") + self.func.emit(imagedata[start_index:], False) + continue + (imagedata, all_code) = imagedata + self.func.emit({}) + self.func.emit({}) + self.func.emit({}) # remove {} and this works + break + except Exception: + pass diff --git a/uncompyle6/parsers/parse34.py b/uncompyle6/parsers/parse34.py index caae5e2e..6a21eb7a 100644 --- a/uncompyle6/parsers/parse34.py +++ b/uncompyle6/parsers/parse34.py @@ -53,6 +53,10 @@ class Python34Parser(Python33Parser): _ifstmts_jump ::= c_stmts_opt JUMP_ABSOLUTE JUMP_FORWARD COME_FROM genexpr_func ::= LOAD_ARG _come_froms FOR_ITER store comp_iter JUMP_BACK + + if_exp_lambda ::= expr jmp_false expr return_if_lambda come_froms return_stmt_lambda LAMBDA_MARKER + return_if_lambda ::= RETURN_END_IF_LAMBDA come_froms + return_if_stmt ::= return_expr RETURN_END_IF POP_BLOCK """ def customize_grammar_rules(self, tokens, customize): diff --git a/uncompyle6/parsers/parse35.py b/uncompyle6/parsers/parse35.py index cc07ccc2..8522198c 100644 --- a/uncompyle6/parsers/parse35.py +++ b/uncompyle6/parsers/parse35.py @@ -107,7 +107,6 @@ class Python35Parser(Python34Parser): # Python 3.5+ does jump optimization # In <.3.5 the below is a JUMP_FORWARD to a JUMP_ABSOLUTE. - return_if_stmt ::= return_expr RETURN_END_IF POP_BLOCK return_if_lambda ::= RETURN_END_IF_LAMBDA COME_FROM return ::= return_expr RETURN_END_IF diff --git a/uncompyle6/parsers/parse36.py b/uncompyle6/parsers/parse36.py index 53d792c3..08e7ae1a 100644 --- a/uncompyle6/parsers/parse36.py +++ b/uncompyle6/parsers/parse36.py @@ -52,6 +52,8 @@ class Python36Parser(Python35Parser): for_block ::= l_stmts_opt come_from_loops JUMP_BACK come_from_loops ::= COME_FROM_LOOP* + whilestmt ::= SETUP_LOOP testexpr l_stmts_opt + JUMP_BACK come_froms POP_BLOCK whilestmt ::= SETUP_LOOP testexpr l_stmts_opt JUMP_BACK come_froms POP_BLOCK COME_FROM_LOOP whilestmt ::= SETUP_LOOP testexpr l_stmts_opt diff --git a/uncompyle6/scanners/scanner2.py b/uncompyle6/scanners/scanner2.py index d73124c9..a1bd9cf5 100644 --- a/uncompyle6/scanners/scanner2.py +++ b/uncompyle6/scanners/scanner2.py @@ -491,7 +491,8 @@ class Scanner2(Scanner): if show_asm in ("both", "after"): print("\n# ---- tokenization:") - for t in new_tokens: + # FIXME: t.format() is changing tokens! + for t in new_tokens.copy(): print(t.format(line_prefix="")) print() return new_tokens, customize diff --git a/uncompyle6/scanners/scanner26.py b/uncompyle6/scanners/scanner26.py index b32e881a..e911968d 100755 --- a/uncompyle6/scanners/scanner26.py +++ b/uncompyle6/scanners/scanner26.py @@ -349,7 +349,8 @@ class Scanner26(Scanner2): if show_asm in ("both", "after"): print("\n# ---- tokenization:") - for t in tokens: + # FIXME: t.format() is changing tokens! + for t in tokens.copy(): print(t.format(line_prefix="")) print() return tokens, customize diff --git a/uncompyle6/scanners/scanner3.py b/uncompyle6/scanners/scanner3.py index 641a6cb8..1e18b980 100644 --- a/uncompyle6/scanners/scanner3.py +++ b/uncompyle6/scanners/scanner3.py @@ -203,7 +203,7 @@ class Scanner3(Scanner): self, insts, next_tokens, inst, t, i, collection_type ): """ - Try to a replace sequence of instruction that ends with a + Try to replace a sequence of instruction that ends with a BUILD_xxx with a sequence that can be parsed much faster, but inserting the token boundary at the beginning of the sequence. """ @@ -285,7 +285,7 @@ class Scanner3(Scanner): ) return new_tokens - def bound_map_from_inst(self, insts, next_tokens, inst, t, i): + def bound_map_from_inst(self, insts, next_tokens, t, i): """ Try to a sequence of instruction that ends with a BUILD_MAP into a sequence that can be parsed much faster, but inserting the @@ -300,25 +300,19 @@ class Scanner3(Scanner): if count < 5: return None - if self.version >= (3, 5): - # Newer Python BUILD_MAP argument's count is a - # key and value pair so it is multiplied by two. - collection_start = i - (count * 2) - assert (count * 2) <= i + # Newer Python BUILD_MAP argument's count is a + # key and value pair so it is multiplied by two. + collection_start = i - (count * 2) + assert (count * 2) <= i - for j in range(collection_start, i, 2): - if insts[j].opname not in ("LOAD_CONST",): - return None - if insts[j + 1].opname not in ("LOAD_CONST",): - return None + for j in range(collection_start, i, 2): + if insts[j].opname not in ("LOAD_CONST",): + return None + if insts[j + 1].opname not in ("LOAD_CONST",): + return None - collection_start = i - (2 * count) - collection_enum = CONST_COLLECTIONS.index("CONST_MAP") - # else: Older Python count is sum of all key and value pairs - # Each pair is added individually like: - # LOAD_CONST ("Max-Age") - # LOAD_CONST ("max-age") - # STORE_MAP + collection_start = i - (2 * count) + collection_enum = CONST_COLLECTIONS.index("CONST_MAP") # If we get here, all instructions before tokens[i] are LOAD_CONST and # we can replace add a boundary marker and change LOAD_CONST to @@ -331,7 +325,7 @@ class Scanner3(Scanner): attr=collection_enum, pattr="CONST_MAP", offset="%s_0" % start_offset, - linestart=False, + linestart=insts[collection_start].starts_line, has_arg=True, has_extended_arg=False, opc=self.opc, @@ -349,6 +343,7 @@ class Scanner3(Scanner): has_arg=True, has_extended_arg=False, opc=self.opc, + optype="pseudo", ) ) new_tokens.append( @@ -361,7 +356,7 @@ class Scanner3(Scanner): has_arg=True, has_extended_arg=False, opc=self.opc, - optype=insts[j + 1].optype, + optype="pseudo", ) ) new_tokens.append( @@ -374,7 +369,93 @@ class Scanner3(Scanner): has_arg=t.has_arg, has_extended_arg=False, opc=t.opc, - optype=t.optype, + optype="pseudo", + ) + ) + return new_tokens + + def bound_map_from_inst_pre35( + self, insts: list, next_tokens: list, t: Token, i: int + ): + """ + Try to a sequence of instruction that ends with a BUILD_MAP into + a sequence that can be parsed much faster, but inserting the + token boundary at the beginning of the sequence. + """ + count = t.attr + assert isinstance(count, int) + + # For small lists don't bother + if count < 10: + return None + + # Older Python BUILD_MAP argument's count is a + # key and value pair and STORE_MAP. So it is multiplied by three. + collection_end = i + 1 + count * 3 + + for j in range(i + 1, collection_end, 3): + if insts[j].opname not in ("LOAD_CONST",): + return None + if insts[j + 1].opname not in ("LOAD_CONST",): + return None + if insts[j + 2].opname not in ("STORE_MAP",): + return None + + collection_enum = CONST_COLLECTIONS.index("CONST_MAP") + + new_tokens = next_tokens[:i] + start_offset = insts[i].offset + new_tokens.append( + Token( + opname="COLLECTION_START", + attr=collection_enum, + pattr="CONST_MAP", + offset="%s_0" % start_offset, + linestart=insts[i].starts_line, + has_arg=True, + has_extended_arg=False, + opc=self.opc, + optype="pseudo", + ) + ) + for j in range(i + 1, collection_end, 3): + new_tokens.append( + Token( + opname="ADD_KEY", + attr=insts[j + 1].argval, + pattr=insts[j + 1].argrepr, + offset=insts[j + 1].offset, + linestart=insts[j + 1].starts_line, + has_arg=True, + has_extended_arg=False, + opc=self.opc, + optype="pseudo", + ) + ) + new_tokens.append( + Token( + opname="ADD_VALUE", + attr=insts[j].argval, + pattr=insts[j].argrepr, + offset=insts[j].offset, + linestart=insts[j].starts_line, + has_arg=True, + has_extended_arg=False, + opc=self.opc, + optype="pseudo", + ) + ) + new_tokens.append( + Token( + opname="BUILD_DICT_OLDER", + attr=t.attr, + pattr=t.pattr, + offset=t.offset, + linestart=t.linestart, + has_arg=t.has_arg, + has_extended_arg=False, + opc=t.opc, + optype="pseudo", ) ) return new_tokens @@ -483,8 +564,17 @@ class Scanner3(Scanner): last_op_was_break = False new_tokens = [] + skip_end_offset = None for i, inst in enumerate(self.insts): + + # BUILD_MAP for < 3.5 can skip *forward* in instructions and + # replace them. So we use the below to get up to the position + # scanned and replaced forward + if skip_end_offset and inst.offset <= skip_end_offset: + continue + skip_end_offset = None + opname = inst.opname argval = inst.argval pattr = inst.argrepr @@ -517,17 +607,38 @@ class Scanner3(Scanner): if try_tokens is not None: new_tokens = try_tokens continue - elif opname in ("BUILD_MAP",) and self.version >= (3, 5): - try_tokens = self.bound_map_from_inst( + + elif opname in ("BUILD_MAP",): + bound_map_from_insts_fn = ( + self.bound_map_from_inst_35 + if self.version >= (3, 5) + else self.bound_map_from_inst_pre35 + ) + try_tokens = bound_map_from_insts_fn( self.insts, new_tokens, - inst, t, i, ) if try_tokens is not None: - new_tokens = try_tokens - continue + if self.version < (3, 5): + assert try_tokens[-1] == "BUILD_DICT_OLDER" + prev_offset = inst.offset + for j in range(i, len(self.insts)): + if self.insts[j].opname == "STORE_NAME": + new_tokens = try_tokens + skip_end_offset = prev_offset + # Set a hacky sentinal to indicate skipping to the + # next instruction + opname = "EXTENDED_ARG" + break + prev_offset = self.insts[j].offset + pass + pass + else: + new_tokens = try_tokens + continue + pass argval = inst.argval op = inst.opcode @@ -786,7 +897,8 @@ class Scanner3(Scanner): if show_asm in ("both", "after"): print("\n# ---- tokenization:") - for t in new_tokens: + # FIXME: t.format() is changing tokens! + for t in new_tokens.copy(): print(t.format(line_prefix="")) print() return new_tokens, customize diff --git a/uncompyle6/scanners/scanner37base.py b/uncompyle6/scanners/scanner37base.py index 64e5ab5a..afe6de66 100644 --- a/uncompyle6/scanners/scanner37base.py +++ b/uncompyle6/scanners/scanner37base.py @@ -225,13 +225,13 @@ class Scanner37Base(Scanner): if show_asm in ("both", "before"): print("\n# ---- disassembly:") - self.insts = bytecode.disassemble_bytes( + bytecode.disassemble_bytes( co.co_code, varnames=co.co_varnames, names=co.co_names, constants=co.co_consts, cells=bytecode._cell_names, - linestarts=bytecode._linestarts, + line_starts=bytecode._linestarts, asm_format="extended", filename=co.co_filename, show_source=True, @@ -478,12 +478,17 @@ class Scanner37Base(Scanner): next_opname = self.insts[i + 1].opname # 'Continue's include jumps to loops that are not - # and the end of a block which follow with POP_BLOCK and COME_FROM_LOOP. - # If the JUMP_ABSOLUTE is to a FOR_ITER and it is followed by another JUMP_FORWARD - # then we'll take it as a "continue". - is_continue = ( - self.insts[self.offset2inst_index[target]].opname == "FOR_ITER" - and self.insts[i + 1].opname == "JUMP_FORWARD" + # and the end of a block which follow with + # POP_BLOCK and COME_FROM_LOOP. If the + # JUMP_ABSOLUTE is to a FOR_ITER, and it is + # followed by another JUMP_FORWARD then we'll take + # it as a "continue". + next_inst = self.insts[i + 1] + is_continue = self.insts[ + self.offset2inst_index[target] + ].opname == "FOR_ITER" and next_inst.opname in ( + "JUMP_FORWARD", + "JUMP_ABSOLUTE", ) if self.version < (3, 8) and ( @@ -498,21 +503,65 @@ class Scanner37Base(Scanner): ): opname = "CONTINUE" else: + # "continue" versus "break_loop" dectction is more complicated + # because "continue" to an outer loop is really a "break loop" opname = "JUMP_BACK" + # FIXME: this is a hack to catch stuff like: # if x: continue # the "continue" is not on a new line. - # There are other situations where we don't catch - # CONTINUE as well. - if tokens[-1].kind == "JUMP_BACK" and tokens[-1].attr <= argval: + # + # Another situation is where we have + # for method in methods: + # for B in method: + # if c: + # return + # break # A "continue" but not the innermost one + if tokens[-1].kind == "JUMP_LOOP" and tokens[-1].attr <= argval: if tokens[-2].kind == "BREAK_LOOP": del tokens[-1] + j -= 1 else: - # intern is used because we are changing the *previous* token - tokens[-1].kind = sys.intern("CONTINUE") - if last_op_was_break and opname == "CONTINUE": - last_op_was_break = False - continue + # "intern" is used because we are + # changing the *previous* token. A + # POP_TOP suggests a "break" rather + # than a "continue"? + if tokens[-2] == "POP_TOP" and ( + is_continue and next_inst.argval != tokens[-1].attr + ): + tokens[-1].kind = sys.intern("BREAK_LOOP") + else: + tokens[-1].kind = sys.intern("CONTINUE") + last_continue = tokens[-1] + pass + pass + pass + # elif ( + # last_continue is not None + # and tokens[-1].kind == "JUMP_LOOP" + # and last_continue.attr <= tokens[-1].attr + # and last_continue.offset > tokens[-1].attr + # ): + # # Handle mis-characterized "CONTINUE" + # # We have a situation like: + # # loop ... for or while) + # # loop + # # if ...: # code below starts here + # # break # not continue + # # + # # POP_JUMP_IF_FALSE_LOOP # to outer loop + # # JUMP_LOOP # to inner loop + # # ... + # # JUMP_LOOP # to outer loop + # tokens[-2].kind = sys.intern("BREAK_LOOP") + # pass + + # if last_op_was_break and opname == "CONTINUE": + # last_op_was_break = False + # continue + pass + else: + opname = "JUMP_FORWARD" elif inst.offset in self.load_asserts: opname = "LOAD_ASSERT" @@ -535,9 +584,10 @@ class Scanner37Base(Scanner): ) pass - if show_asm in ("both", "after"): + if show_asm in ("both", "after") and self.version < (3, 8): print("\n# ---- tokenization:") - for t in tokens: + # FIXME: t.format() is changing tokens! + for t in tokens.copy(): print(t.format(line_prefix="")) print() return tokens, customize diff --git a/uncompyle6/scanners/scanner38.py b/uncompyle6/scanners/scanner38.py index 16858328..66142eea 100644 --- a/uncompyle6/scanners/scanner38.py +++ b/uncompyle6/scanners/scanner38.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022 by Rocky Bernstein +# Copyright (c) 2019-2022, 2024 by Rocky Bernstein # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -117,35 +117,26 @@ class Scanner38(Scanner37): new_tokens.append(token) continue - # We also want to avoid confusing BREAK_LOOPS with parts of the - # grammar rules for loops. (Perhaps we should change the grammar.) - # Try to find an adjacent JUMP_BACK which is part of the normal loop end. + j = i + while tokens[j - 1] in ("POP_TOP", "POP_BLOCK", "POP_EXCEPT"): + j -= 1 + if tokens[j].linestart: + break + token_with_linestart = tokens[j] - if i + 1 < len(tokens) and tokens[i + 1] == "JUMP_BACK": - # Sometimes the jump back is after the "break" instruction.. - jump_back_index = i + 1 - else: - # and sometimes, because of jump-to-jump optimization, it is before the - # jump target instruction. - jump_back_index = self.offset2tok_index[jump_target] - 1 - while tokens[jump_back_index].kind.startswith("COME_FROM_"): - jump_back_index -= 1 - pass - pass - jump_back_token = tokens[jump_back_index] - - # Is this a forward jump not next to a JUMP_BACK ? ... - break_loop = token.linestart and jump_back_token != "JUMP_BACK" - - # or if there is looping jump back, then that loop - # should start before where the "break" instruction sits. - if break_loop or ( - jump_back_token == "JUMP_BACK" - and jump_back_token.attr < token.off2int() - ): + if token_with_linestart.linestart: token.kind = "BREAK_LOOP" + pass new_tokens.append(token) + + if show_asm in ("both", "after"): + print("\n# ---- tokenization:") + # FIXME: t.format() is changing tokens! + for t in new_tokens.copy(): + print(t.format(line_prefix="")) + print() + return new_tokens, customize diff --git a/uncompyle6/scanners/tok.py b/uncompyle6/scanners/tok.py index f76d1d4f..cc491d77 100644 --- a/uncompyle6/scanners/tok.py +++ b/uncompyle6/scanners/tok.py @@ -1,4 +1,4 @@ -# Copyright (c) 2016-2021, 2023 by Rocky Bernstein +# Copyright (c) 2016-2021, 2023-2024 by Rocky Bernstein # Copyright (c) 2000-2002 by hartmut Goebel # Copyright (c) 1999 John Aycock #