You've already forked python-uncompyle6
mirror of
https://github.com/rocky/python-uncompyle6.git
synced 2025-08-04 01:09:52 +08:00
As part of tokenization for (de)parsing, we need to do something like a disassembly, but it is really a little different. Disassembly, strictly speaking, is done by the xdis module now. What "ingestion" does is massage the instruction tokens into a form that is more amenable to parsing. In sum, ingestion is different from disassembly, although disassembly is generally the first part of ingestion.
406 lines
16 KiB
Python
Executable File
406 lines
16 KiB
Python
Executable File
#
|
|
# (C) Copyright 2000-2002 by hartmut Goebel <h.goebel@crazy-compilers.com>
|
|
# (C) Copyright 2015-2016 by Rocky Bernstein
|
|
#
|
|
"""
|
|
byte-code verification
|
|
"""
|
|
|
|
from __future__ import print_function
|
|
|
|
import dis, operator
|
|
|
|
import uncompyle6
|
|
import uncompyle6.scanner as scanner
|
|
from uncompyle6 import PYTHON3
|
|
from xdis.code import iscode
|
|
from xdis.magics import PYTHON_MAGIC_INT
|
|
from xdis.load import load_file, load_module
|
|
from xdis.util import pretty_flags
|
|
|
|
# FIXME: DRY
# Choose the operator that stands in for the 'BINARY_DIVIDE' opcode and make
# sure reduce() is reachable under that name on either major Python version.
if not PYTHON3:
    truediv = operator.div
else:
    from functools import reduce
    truediv = operator.truediv
|
|
|
|
|
|
def code_equal(a, b):
    """Return True when both code objects hold identical raw bytecode.

    Only the ``co_code`` attribute of each argument is consulted.
    """
    bytecode_a = a.co_code
    bytecode_b = b.co_code
    return bytecode_a == bytecode_b
|
|
|
|
# Maps a binary-operation opcode name to the Python function that computes
# it.  cmp_code_objects() uses this to evaluate two constant operands and
# compare the result against an already-folded constant in the other code
# object.
BIN_OP_FUNCS = {
    'BINARY_POWER': operator.pow,
    'BINARY_MULTIPLY': operator.mul,
    'BINARY_DIVIDE': truediv,
    'BINARY_FLOOR_DIVIDE': operator.floordiv,
    'BINARY_TRUE_DIVIDE': operator.truediv,
    'BINARY_MODULO': operator.mod,
    'BINARY_ADD': operator.add,
    # The opcode is spelled BINARY_SUBTRACT; the former key 'BINARY_SUBRACT'
    # could never match a token type.
    'BINARY_SUBTRACT': operator.sub,
    'BINARY_LSHIFT': operator.lshift,
    'BINARY_RSHIFT': operator.rshift,
    'BINARY_AND': operator.and_,
    'BINARY_XOR': operator.xor,
    'BINARY_OR': operator.or_,
}
|
|
|
|
# Jump opcode names for the bytecode version currently being verified.
# Starts out unset; cmp_code_objects() rebinds it (via ``global``) from the
# version-specific scanner module before any tokens are compared.
JUMP_OPs = None
|
|
|
|
# --- exceptions ---
|
|
|
|
class VerifyCmpError(Exception):
    """Common base class for the comparison errors defined in this module."""
|
|
|
|
class CmpErrorConsts(VerifyCmpError):
    """Raised to report that a constant differs between two code objects.

    name  -- name of the code object being compared
    index -- position of the differing constant
    """

    def __init__(self, name, index):
        self.name, self.index = name, index

    def __str__(self):
        template = 'Compare Error within Consts of %s at index %i'
        return template % (repr(self.name), self.index)
|
|
|
|
class CmpErrorConstsType(VerifyCmpError):
    """Raised to report that the type of a constant differs.

    name  -- name of the code object being compared
    index -- position of the constant whose type differs
    """

    def __init__(self, name, index):
        self.name, self.index = name, index

    def __str__(self):
        template = 'Consts type differ in %s at index %i'
        return template % (repr(self.name), self.index)
|
|
|
|
class CmpErrorConstsLen(VerifyCmpError):
    """Raised to report that the two co_consts sequences differ in length.

    name    -- name of the code object being compared
    consts1 -- constants of the first code object
    consts2 -- constants of the second code object
    """

    def __init__(self, name, consts1, consts2):
        self.name = name
        self.consts = (consts1, consts2)

    def __str__(self):
        first, second = self.consts
        return 'Consts length differs in %s:\n\n%i:\t%s\n\n%i:\t%s\n\n' % (
            repr(self.name),
            len(first), repr(first),
            len(second), repr(second))
|
|
|
|
class CmpErrorCode(VerifyCmpError):
    """Exception to be raised when code differs.

    name    -- name of the code object being compared
    index   -- offset reported for the mismatch
    token1  -- offending token from the first token list
    token2  -- offending token from the second token list
    tokens1 -- complete first token list (shown in the message)
    tokens2 -- complete second token list (shown in the message)
    """

    def __init__(self, name, index, token1, token2, tokens1, tokens2):
        self.name = name
        self.index = index
        self.token1 = token1
        self.token2 = token2
        self.tokens = [tokens1, tokens2]

    def __str__(self):
        # Pair the two token lists row by row.  zip_longest() pads the shorter
        # list with None, which is what Python 2's multi-sequence map() did;
        # Python 3's map() stops at the shorter list and silently dropped the
        # remaining tokens from the listing.
        try:
            from itertools import zip_longest
        except ImportError:  # Python 2
            from itertools import izip_longest as zip_longest

        rows = ''.join("%-37s\t%-37s\n" % pair
                       for pair in zip_longest(self.tokens[0], self.tokens[1]))
        listing = ('Code differs in %s\n' % str(self.name)) + rows
        return ('Code differs in %s at offset %s [%s] != [%s]\n\n' %
                (repr(self.name), self.index,
                 repr(self.token1), repr(self.token2))) + listing
|
|
|
|
class CmpErrorCodeLen(VerifyCmpError):
    """Exception to be raised when code length differs.

    name    -- name of the code object being compared
    tokens1 -- complete first token list
    tokens2 -- complete second token list
    """

    def __init__(self, name, tokens1, tokens2):
        self.name = name
        self.tokens = [tokens1, tokens2]

    def __str__(self):
        # This error is raised because the lists have different lengths, so
        # the rows must be padded rather than truncated: zip_longest() fills
        # the shorter list with None, as Python 2's multi-sequence map() did.
        # Python 3's map() would cut the listing at the shorter list and hide
        # exactly the surplus tokens.
        try:
            from itertools import zip_longest
        except ImportError:  # Python 2
            from itertools import izip_longest as zip_longest

        rows = ''.join("%-37s\t%-37s\n" % pair
                       for pair in zip_longest(self.tokens[0], self.tokens[1]))
        return ('Code len differs in %s\n' % str(self.name)) + rows
|
|
|
|
class CmpErrorMember(VerifyCmpError):
    """Raised to report that some other code-object member differs.

    name   -- name of the code object being compared
    member -- name of the differing attribute
    data1  -- value found on the first code object
    data2  -- value found on the second code object
    """

    def __init__(self, name, member, data1, data2):
        self.name = name
        self.member = member
        self.data = (data1, data2)

    def __str__(self):
        left, right = self.data
        return 'Member %s differs in %s:\n\t%s\n\t%s\n' % (
            repr(self.member), repr(self.name), repr(left), repr(right))
|
|
|
|
# --- compare ---
|
|
|
|
# these members are ignored
|
|
# cmp_code_objects() tests each 'co_*' attribute name against this list first;
# a name found here is skipped before any member-specific comparison runs.
__IGNORE_CODE_MEMBERS__ = ['co_filename', 'co_firstlineno', 'co_lnotab', 'co_stacksize', 'co_names']
|
|
|
|
def _scanner_for_version(version, is_pypy):
    """Return ``(scan, scanner)`` for the bytecode *version* being verified.

    *scan* is the version-specific scanner module (the caller reads its
    ``JUMP_OPs``) and *scanner* is an instance of that module's scanner class.
    *is_pypy* only matters for 2.7 and 3.2, which have PyPy variants.

    Raises RuntimeError for a version that has no branch here; previously
    that case surfaced later as an UnboundLocalError.
    """
    # NOTE(review): the 2.3 and 2.4 branches used to instantiate
    # ``scan.Scanner26`` / ``scan.Scanner25`` from the scanner23 / scanner24
    # modules.  They now follow the scannerNN -> ScannerNN pairing every other
    # branch uses; the class names and their constructor signatures live
    # outside this file -- confirm.
    if version == 2.3:
        import uncompyle6.scanners.scanner23 as scan
        scanner = scan.Scanner23()
    elif version == 2.4:
        import uncompyle6.scanners.scanner24 as scan
        scanner = scan.Scanner24()
    elif version == 2.5:
        import uncompyle6.scanners.scanner25 as scan
        scanner = scan.Scanner25()
    elif version == 2.6:
        import uncompyle6.scanners.scanner26 as scan
        scanner = scan.Scanner26()
    elif version == 2.7:
        if is_pypy:
            import uncompyle6.scanners.pypy27 as scan
            scanner = scan.ScannerPyPy27(show_asm=False)
        else:
            import uncompyle6.scanners.scanner27 as scan
            scanner = scan.Scanner27()
    elif version == 3.2:
        if is_pypy:
            import uncompyle6.scanners.pypy32 as scan
            scanner = scan.ScannerPyPy32()
        else:
            import uncompyle6.scanners.scanner32 as scan
            scanner = scan.Scanner32()
    elif version == 3.3:
        import uncompyle6.scanners.scanner33 as scan
        scanner = scan.Scanner33()
    elif version == 3.4:
        import uncompyle6.scanners.scanner34 as scan
        scanner = scan.Scanner34()
    elif version == 3.5:
        import uncompyle6.scanners.scanner35 as scan
        scanner = scan.Scanner35()
    elif version == 3.6:
        import uncompyle6.scanners.scanner36 as scan
        scanner = scan.Scanner36()
    else:
        raise RuntimeError(
            "Unsupported Python version %s for bytecode verification" % version)
    return scan, scanner


def cmp_code_objects(version, is_pypy, code_obj1, code_obj2,
                     name='', ignore_code=False):
    """
    Compare two code-objects.

    This is the main part of this module.

    version     -- bytecode version as a float (2.3 ... 3.6); selects the
                   scanner used to tokenize both code objects
    is_pypy     -- true when the bytecode comes from PyPy
    code_obj1   -- first code object
    code_obj2   -- second code object
    name        -- name prefix used in error messages; '__main__' is replaced
                   by code_obj1.co_name, anything else becomes
                   '<name>.<co_name>'
    ignore_code -- when true every member is skipped, so only the sanity
                   assertions at the top are performed

    Every 'co_*' attribute of code_obj1 is visited in sorted order:
    names in __IGNORE_CODE_MEMBERS__ are skipped, 'co_code' is compared token
    by token with a few known-equivalent sequences tolerated, 'co_consts' is
    compared only by recursing into nested code objects, 'co_flags' is
    compared as integers, and any other member must be equal.

    Returns None.  Raises CmpErrorCodeLen, CmpErrorCode or CmpErrorMember on
    the first difference found, RuntimeError for an unsupported version, and
    AssertionError if an argument is not a code object or the two objects
    expose different attribute names.
    """
    global JUMP_OPs

    assert iscode(code_obj1), \
        "cmp_code_object first object type is %s, not code" % type(code_obj1)
    assert iscode(code_obj2), \
        "cmp_code_object second object type is %s, not code" % type(code_obj2)
    # Both objects must expose the same attribute names.  (The former
    # old-style-class fallback was unreachable: isinstance(x, object) is
    # always true.)
    assert dir(code_obj1) == dir(code_obj2)

    if name == '__main__':
        name = code_obj1.co_name
    else:
        name = '%s.%s' % (name, code_obj1.co_name)
        if name == '.?':
            name = '__main__'

    members = sorted(x for x in dir(code_obj1) if x.startswith('co_'))

    for member in members:
        if member in __IGNORE_CODE_MEMBERS__ or ignore_code:
            pass
        elif member == 'co_code' and not ignore_code:
            scan, scanner = _scanner_for_version(version, is_pypy)

            # Token.__cmp__ consults this module-level list.
            JUMP_OPs = list(scan.JUMP_OPs) + ['JUMP_BACK']

            # use changed Token class
            # We (re)set this here to save exception handling,
            # which would get confusing.
            scanner.setTokenClass(Token)
            try:
                # ingest both code-objects; only the token lists are needed
                tokens1, customize = scanner.ingest(code_obj1)
                del customize  # save memory
                tokens2, customize = scanner.ingest(code_obj2)
                del customize  # save memory
            finally:
                scanner.resetTokenClass()  # restore Token class

            targets1 = dis.findlabels(code_obj1.co_code)
            tokens1 = [t for t in tokens1 if t.type != 'COME_FROM']
            tokens2 = [t for t in tokens2 if t.type != 'COME_FROM']

            i1 = 0
            i2 = 0
            # offset in tokens1 -> offset of the token it was matched with
            offset_map = {}
            # forward-jump target offset in tokens1 -> [(i1, i2, dest2), ...];
            # checked once the walk reaches that target offset
            check_jumps = {}
            while i1 < len(tokens1):
                if i2 >= len(tokens2):
                    # tokens2 is exhausted.  Tolerate exactly two surplus
                    # tokens in tokens1 when they are a trailing
                    # "LOAD_CONST None; RETURN_VALUE" that follows another
                    # RETURN_VALUE.
                    if len(tokens1) == len(tokens2) + 2 \
                            and tokens1[-1].type == 'RETURN_VALUE' \
                            and tokens1[-2].type == 'LOAD_CONST' \
                            and tokens1[-2].pattr is None \
                            and tokens1[-3].type == 'RETURN_VALUE':
                        break
                    else:
                        raise CmpErrorCodeLen(name, tokens1, tokens2)

                offset_map[tokens1[i1].offset] = tokens2[i2].offset

                # Earlier forward jumps that target this offset must land on
                # the corresponding token of tokens2.
                for idx1, idx2, offset2 in check_jumps.get(tokens1[i1].offset, []):
                    if offset2 != tokens2[i2].offset:
                        raise CmpErrorCode(name, tokens1[idx1].offset, tokens1[idx1],
                                           tokens2[idx2], tokens1, tokens2)

                if tokens1[i1].type != tokens2[i2].type:
                    # NOTE(review): this branch is entered only when the two
                    # types differ, so both being 'LOAD_CONST' cannot hold and
                    # the constant-folding allowances below never run.
                    # Presumably the outer test once compared whole tokens --
                    # confirm the intended condition before relying on them.
                    if tokens1[i1].type == 'LOAD_CONST' == tokens2[i2].type:
                        i = 1
                        while tokens1[i1+i].type == 'LOAD_CONST':
                            i += 1
                        if tokens1[i1+i].type.startswith(('BUILD_TUPLE', 'BUILD_LIST')) \
                                and i == int(tokens1[i1+i].type.split('_')[-1]):
                            # i constants + BUILD_TUPLE_i/BUILD_LIST_i versus
                            # one pre-built constant
                            t = tuple([elem.pattr for elem in tokens1[i1:i1+i]])
                            if t != tokens2[i2].pattr:
                                raise CmpErrorCode(name, tokens1[i1].offset, tokens1[i1],
                                                   tokens2[i2], tokens1, tokens2)
                            i1 += i + 1
                            i2 += 1
                            continue
                        elif i == 2 and tokens1[i1+i].type == 'ROT_TWO' and tokens2[i2+1].type == 'UNPACK_SEQUENCE_2':
                            i1 += 3
                            i2 += 2
                            continue
                        elif i == 2 and tokens1[i1+i].type in BIN_OP_FUNCS:
                            # two constants + binary op versus the folded result
                            f = BIN_OP_FUNCS[tokens1[i1+i].type]
                            if f(tokens1[i1].pattr, tokens1[i1+1].pattr) == tokens2[i2].pattr:
                                i1 += 3
                                i2 += 1
                                continue
                    elif tokens1[i1].type == 'UNARY_NOT':
                        # "UNARY_NOT; POP_JUMP_IF_x" is accepted against the
                        # single opposite conditional jump.
                        if tokens2[i2].type == 'POP_JUMP_IF_TRUE':
                            if tokens1[i1+1].type == 'POP_JUMP_IF_FALSE':
                                i1 += 2
                                i2 += 1
                                continue
                        elif tokens2[i2].type == 'POP_JUMP_IF_FALSE':
                            if tokens1[i1+1].type == 'POP_JUMP_IF_TRUE':
                                i1 += 2
                                i2 += 1
                                continue
                    elif tokens1[i1].type in ('JUMP_FORWARD', 'JUMP_BACK') \
                            and tokens1[i1-1].type == 'RETURN_VALUE' \
                            and tokens2[i2-1].type in ('RETURN_VALUE', 'RETURN_END_IF') \
                            and int(tokens1[i1].offset) not in targets1:
                        # A jump right after a return that nothing jumps to:
                        # skip it in tokens1 only.
                        i1 += 1
                        continue
                    elif tokens1[i1].type == 'JUMP_FORWARD' and tokens2[i2].type == 'JUMP_BACK' \
                            and tokens1[i1+1].type == 'JUMP_BACK' and tokens2[i2+1].type == 'JUMP_BACK' \
                            and int(tokens1[i1].pattr) == int(tokens1[i1].offset) + 3:
                        # A JUMP_FORWARD whose target is the JUMP_BACK right
                        # behind it is accepted against two JUMP_BACKs.
                        if int(tokens1[i1].pattr) == int(tokens1[i1+1].offset):
                            i1 += 2
                            i2 += 2
                            continue

                    raise CmpErrorCode(name, tokens1[i1].offset, tokens1[i1],
                                       tokens2[i2], tokens1, tokens2)
                elif tokens1[i1].type in JUMP_OPs and tokens1[i1].pattr != tokens2[i2].pattr:
                    dest1 = int(tokens1[i1].pattr)
                    dest2 = int(tokens2[i2].pattr)
                    if tokens1[i1].type == 'JUMP_BACK':
                        # The target of a JUMP_BACK is looked up among the
                        # offsets already recorded in offset_map.
                        if offset_map[dest1] != dest2:
                            raise CmpErrorCode(name, tokens1[i1].offset, tokens1[i1],
                                               tokens2[i2], tokens1, tokens2)
                    else:
                        # Target not reached yet: remember it for the
                        # check_jumps test above.
                        if dest1 in check_jumps:
                            check_jumps[dest1].append((i1, i2, dest2))
                        else:
                            check_jumps[dest1] = [(i1, i2, dest2)]

                i1 += 1
                i2 += 1
            del tokens1, tokens2  # save memory
        elif member == 'co_consts':
            # partial optimization can make the co_consts look different,
            # so we'll just compare the code consts
            codes1 = (c for c in code_obj1.co_consts if hasattr(c, 'co_consts'))
            codes2 = (c for c in code_obj2.co_consts if hasattr(c, 'co_consts'))

            for c1, c2 in zip(codes1, codes2):
                cmp_code_objects(version, is_pypy, c1, c2, name=name)
        elif member == 'co_flags':
            flags1 = code_obj1.co_flags
            flags2 = code_obj2.co_flags
            if is_pypy:
                # For PYPY for now we don't care about PYPY_SOURCE_IS_UTF8:
                flags2 &= ~0x0100  # PYPY_SOURCE_IS_UTF8
            if flags1 != flags2:
                raise CmpErrorMember(name, 'co_flags',
                                     pretty_flags(flags1),
                                     pretty_flags(flags2))
        else:
            # all other members must be equal
            if getattr(code_obj1, member) != getattr(code_obj2, member):
                raise CmpErrorMember(name, member,
                                     getattr(code_obj1, member),
                                     getattr(code_obj2, member))
|
|
|
|
class Token(scanner.Token):
    """Token class with changed semantics for 'cmp()'.

    ``__cmp__`` follows the cmp() protocol: 0 means "treat as equal", any
    other value means "different".  Several token pairs that are spelled
    differently are deliberately reported as equal, and jump tokens are
    compared by type only.  Note that ``__cmp__`` is only consulted by
    Python 2; Python 3 ignores it.
    """

    @staticmethod
    def _three_way(a, b):
        """Return 0 if a == b, else -1 or 1 in the manner of cmp()."""
        if a == b:
            return 0
        try:
            return -1 if a < b else 1
        except TypeError:
            # Operands that cannot be ordered are still "different".
            return 1

    def __cmp__(self, o):
        t = self.type  # shortcut
        if t == 'BUILD_TUPLE_0' and o.type == 'LOAD_CONST' and o.pattr == ():
            return 0
        if t == 'COME_FROM' == o.type:
            return 0
        if t == 'PRINT_ITEM_CONT' and o.type == 'PRINT_ITEM':
            return 0
        if t == 'RETURN_VALUE' and o.type == 'RETURN_END_IF':
            return 0
        if t == 'JUMP_IF_FALSE_OR_POP' and o.type == 'POP_JUMP_IF_FALSE':
            return 0
        # The two results below used to be ``t == o.type``, i.e. True (1) for
        # matching types, which reads as "different" under the cmp() protocol
        # and contradicts the ``return 0`` cases above.
        if JUMP_OPs and t in JUMP_OPs:
            # ignore offset
            return self._three_way(t, o.type)
        return self._three_way(t, o.type) or self._three_way(self.pattr, o.pattr)

    def __repr__(self):
        return '%s %s (%s)' % (str(self.type), str(self.attr),
                               repr(self.pattr))

    def __str__(self):
        return '%s\t%-17s %r' % (self.offset, self.type, self.pattr)
|
|
|
|
def compare_code_with_srcfile(pyc_filename, src_filename, weak_verify=False):
    """Check a .pyc file against the code obtained from a source file.

    Returns a message string, without comparing anything, when the magic
    number of the .pyc differs from PYTHON_MAGIC_INT.  Otherwise the source
    file is loaded, both code objects are handed to cmp_code_objects()
    (with ignore_code=weak_verify), and None is returned.  Whatever
    cmp_code_objects() raises is propagated.
    """
    (version, timestamp, magic_int,
     code_obj1, is_pypy) = load_module(pyc_filename)

    if magic_int == PYTHON_MAGIC_INT:
        code_obj2 = load_file(src_filename)
        cmp_code_objects(version, is_pypy, code_obj1, code_obj2,
                         ignore_code=weak_verify)
        return None

    template = ("Can't compare code - Python is running with magic %s, "
                "but code is magic %s ")
    return template % (PYTHON_MAGIC_INT, magic_int)
|
|
|
|
def compare_files(pyc_filename1, pyc_filename2, weak_verify=False):
    """Compare two .pyc files.

    Both files are loaded with load_module() and their code objects are
    handed to cmp_code_objects() with ignore_code=weak_verify.  The version
    and is_pypy values passed on are the ones read from the second file.
    Returns None; whatever cmp_code_objects() raises is propagated.
    """
    # Use the load_module this file imports from xdis.load, the same loader
    # compare_code_with_srcfile() uses, instead of reaching through the
    # uncompyle6 package namespace.
    _, _, _, code_obj1, _ = load_module(pyc_filename1)
    version, _, _, code_obj2, is_pypy = load_module(pyc_filename2)
    cmp_code_objects(version, is_pypy, code_obj1, code_obj2, ignore_code=weak_verify)
|
|
|
|
if __name__ == '__main__':
    # Tiny smoke test: build two tokens, show their repr() and report
    # whether their types and attrs match.
    t1 = Token('LOAD_CONST', None, 'code_object _expandLang', 52)
    t2 = Token('LOAD_CONST', -421, 'code_object _expandLang', 55)
    for tok in (t1, t2):
        print(repr(tok))
    same_type = t1.type == t2.type
    same_attr = t1.attr == t2.attr
    print(same_type, same_attr)
|