3.6+ extended arg handling; sync with decompyle3.

Use 3.8 scanner now. TODO: Need to investigate what's up with 3.7/01_extended_arg.py
2025-08-04 01:09:52 +08:00 · 2020-01-23 13:35:22 -05:00
parent eeb48818f3
commit 28a80a0132
4 changed files with 88 additions and 38 deletions
--- a/test/bytecode_3.7/01_extended_arg.pyc-notyet
+++ b/test/bytecode_3.7/01_extended_arg.pyc-notyet
--- a/test/stdlib/3.8-exclude.sh
+++ b/test/stdlib/3.8-exclude.sh
@@ -7,7 +7,7 @@ SKIP_TESTS=(
    [test_baseexception.py]=1  #
    [test_bdb.py]=1  #
    [test_buffer.py]=1  # parse error
-    [test_builtin.py]=1  # parser error
+    [test_builtin.py]=1  # parse error
    [test_clinic.py]=1 # it fails on its own
    [test_cmath.py]=1 # test assertion failure
    [test_cmd_line.py]=1  # Interactive?
--- a/uncompyle6/scanners/scanner38.py
+++ b/uncompyle6/scanners/scanner38.py
@@ -1,4 +1,4 @@
-#  Copyright (c) 2019 by Rocky Bernstein
+#  Copyright (c) 2019-2020 by Rocky Bernstein
 #
 #  This program is free software: you can redistribute it and/or modify
 #  it under the terms of the GNU General Public License as published by
@@ -12,15 +12,17 @@
 #
 #  You should have received a copy of the GNU General Public License
 #  along with this program.  If not, see <http://www.gnu.org/licenses/>.
-"""Python 3.8 bytecode decompiler scanner
+"""
+Python 3.8 bytecode decompiler scanner.

-Does some token massaging of xdis-disassembled instructions to make
-things easier for decompilation.
+Does some additional massaging of xdis-disassembled instructions to
+make things easier for decompilation.

 This sets up Python 3.8's opcodes and calls a generalized
 scanner routine for Python 3.7 and up.
 """

+from uncompyle6.scanners.tok import off2int
 from uncompyle6.scanners.scanner37 import Scanner37
 from uncompyle6.scanners.scanner37base import Scanner37Base

@@ -34,6 +36,7 @@ JUMP_OPs = opc.JUMP_OPS
 class Scanner38(Scanner37):
    def __init__(self, show_asm=None):
+        """Initialize via Scanner37Base.__init__ with version 3.8; debug output starts off."""
        Scanner37Base.__init__(self, 3.8, show_asm)
+        # When true, the loop-range detection further down in this class
+        # prints the jump-back targets and each loop offset it adds/removes.
+        self.debug = False
        return

    pass
@@ -42,30 +45,77 @@ class Scanner38(Scanner37):
        tokens, customize = super(Scanner38, self).ingest(
            co, classname, code_objects, show_asm
        )
+
+        # Hacky way to detect loop ranges.
+        # The key in jump_back_targets is the start of the loop.
+        # The value is where the loop ends. In current Python,
+        # JUMP_BACKS are always to loops. And blocks are ordered so that the
+        # JUMP_BACK with the highest offset will be where the range ends.
+        jump_back_targets = {}
+        for token in tokens:
+            if token.kind == "JUMP_BACK":
+                jump_back_targets[token.attr] = token.offset
+                pass
+            pass
+
+        if self.debug and jump_back_targets:
+            print(jump_back_targets)
+        loop_ends: List[int] = []
+        next_end = tokens[len(tokens)-1].off2int() + 10
        for i, token in enumerate(tokens):
            opname = token.kind
-            if opname in ("JUMP_FORWARD", "JUMP_ABSOLUTE"):
-                # Turn JUMPs into BREAK_LOOP
+            offset = token.offset
+            if offset == next_end:
+                loop_ends.pop()
+                if self.debug:
+                    print(f"{'  ' * len(loop_ends)}remove loop offset {offset}")
+                    pass
+                next_end = loop_ends[-1] if len(loop_ends) else tokens[len(tokens)-1].off2int() + 10
+
+            if offset in jump_back_targets:
+                next_end = off2int(jump_back_targets[offset], prefer_last=False)
+                if self.debug:
+                    print(f"{'  ' * len(loop_ends)}adding loop offset {offset} ending at {next_end}")
+                loop_ends.append(next_end)
+
+            # Turn JUMP opcodes into "BREAK_LOOP" opcodes.
+            # FIXME: this should be replaced by proper control flow.
+            if opname in ("JUMP_FORWARD", "JUMP_ABSOLUTE") and len(loop_ends):
                jump_target = token.attr

-                if opname == "JUMP_ABSOLUTE" and token.offset >= jump_target:
-                    # Not a forward jump, so continue
+                if opname == "JUMP_ABSOLUTE" and jump_target <= next_end:
+                    # Not a forward-enough jump to break out of the next loop, so continue.
                    # FIXME: Do we need "continue" detection?
                    continue
+
+                # We also want to avoid confusing BREAK_LOOPS with parts of the
+                # grammar rules for loops. (Perhaps we should change the grammar.)
+                # Try to find an adjacent JUMP_BACK which is part of the normal loop end.
+
                if i + 1 < len(tokens) and tokens[i + 1] == "JUMP_BACK":
-                    # Sometimes the jump back is *after* the break...
+                    # Sometimes the jump back is after the "break" instruction..
                    jump_back_index = i + 1
                else:
-                    # and sometimes it is *before* where we jumped to.
+                    # and sometimes, because of jump-to-jump optimization, it is before the
+                    # jump target instruction.
                    jump_back_index = self.offset2tok_index[jump_target] - 1
                    while tokens[jump_back_index].kind.startswith("COME_FROM_"):
                        jump_back_index -= 1
                        pass
                    pass
                jump_back_token = tokens[jump_back_index]
-                if (
+
+                # Is this a forward jump not next to a JUMP_BACK ? ...
+                break_loop = (
+                    token.linestart
+                    and jump_back_token != "JUMP_BACK"
+                )
+
+                # or if there is looping jump back, then that loop
+                # should start before where the "break" instruction sits.
+                if break_loop or (
                    jump_back_token == "JUMP_BACK"
-                    and jump_back_token.attr < token.offset
+                    and jump_back_token.attr < token.off2int()
                ):
                    token.kind = "BREAK_LOOP"
                pass
--- a/uncompyle6/scanners/tok.py
+++ b/uncompyle6/scanners/tok.py
@@ -22,6 +22,28 @@ if PYTHON3:
    intern = sys.intern


+def off2int(offset, prefer_last=True):
+    """Convert a token offset to an int.
+
+    `offset` is either an int, which is returned unchanged, or a string
+    holding one or two "_"-separated integers.
+
+    For a two-part string whose parts differ by exactly 2 (treated here
+    as an instruction with an extended arg), return the second part when
+    `prefer_last` is true and the first part otherwise. For any other
+    two-part string, return the first part.
+    """
+    if isinstance(offset, int):
+        return offset
+    else:
+        assert isinstance(offset, str)
+        offsets = list(map(int, offset.split("_")))
+        if len(offsets) == 1:
+            return offsets[0]
+        else:
+            assert len(offsets) == 2
+            offset_1, offset_2 = offsets
+        if offset_1 + 2 == offset_2:
+            # This is an instruction with an extended arg.
+            # For things that compare against offsets, we generally want the
+            # later offset.
+            return offset_2 if prefer_last else offset_1
+        else:
+            # Probably a "COME_FROM"-type offset, where the second number
+            # is just a count, and not really an offset.
+            return offset_1
+
+
 class Token:
    """
    Class representing a byte-code instruction.
@@ -44,7 +66,7 @@ class Token:
        op=None,
        has_arg=None,
        opc=None,
-        has_extended_arg=False
+        has_extended_arg=False,
    ):
        self.kind = intern(opname)
        self.has_arg = has_arg
@@ -165,29 +187,7 @@ class Token:
        raise IndexError

    def off2int(self, prefer_last=True):
-        if isinstance(self.offset, int):
-            return self.offset
-        else:
-            assert isinstance(self.offset, str)
-            offsets = list(map(int, self.offset.split("_")))
-            if len(offsets) == 1:
-                return offsets[0]
-            else:
-                assert len(offsets) == 2
-                offset_1, offset_2 = offsets
-            if offset_1 + 2 == offset_2:
-                # This is an instruction with an extended arg.
-                # For things that compare against offsets, we generally want the
-                # later offset.
-                if prefer_last:
-                    return offset_2
-                else:
-                    return offset_1
-            else:
-                # Probably a "COME_FROM"-type offset, where the second number
-                # is just a count, and not really an offset.
-                return offset_1
-            return(int(self.offset.split("_")[0]))
+        """Return this token's offset as an int.
+
+        Delegates to the module-level off2int(). `prefer_last` is
+        forwarded so that prefer_last=False still selects the first
+        number of an extended-arg offset pair, as the removed inline
+        code did.
+        """
+        return off2int(self.offset, prefer_last)


 NoneToken = Token("LOAD_CONST", offset=-1, attr=None, pattr=None)