forked from svaarala/duktape
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscan_strings.py
More file actions
135 lines (112 loc) · 3.76 KB
/
scan_strings.py
File metadata and controls
135 lines (112 loc) · 3.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env python2
#
# Scan potential external strings from ECMAScript and C files.
#
# Very simplistic example with a lot of limitations:
#
# - Doesn't handle multiple variables in a variable declaration
#
# - For C files only double-quoted string literals are extracted; these
# may correspond to Duktape/C bindings (but in many cases don't)
#
import ast
import base64
import json
import os
import re
import sys
# Scanned strings are collected as the keys of this dictionary.
strmap = dict()

# Name following the 'function' keyword in an ECMAScript function
# declaration.
re_funcname = re.compile(r'function\s+(\w+)', flags=re.UNICODE)

# Name following the 'var' keyword in an ECMAScript variable declaration.
# XXX: doesn't handle multiple variables
re_vardecl = re.compile(r'var\s+(\w+)', flags=re.UNICODE)

# Left-hand side name of an ECMAScript variable assignment.
re_varassign = re.compile(r'(\w+)\s*=\s*', flags=re.UNICODE)

# ECMAScript dotted property reference such as 'foo.bar.baz'.  This also
# matches numbers like '4.0'; re_digits is used to recognize an all-digit
# first component so that such matches can be rejected.
re_propref = re.compile(r'(\w+(?:\.\w+)+)', flags=re.UNICODE)
re_digits = re.compile(r'^\d+$', flags=re.UNICODE)

# ECMAScript or C string literal, including the surrounding quotes:
# double-quoted and single-quoted forms respectively.
re_strlit_dquot = re.compile(r'("(?:\\"|\\\\|[^"])*")', flags=re.UNICODE)
re_strlit_squot = re.compile(r'(\'(?:\\\'|\\\\|[^\'])*\')', flags=re.UNICODE)
def strDecode(x):
    """Decode the source text of a quoted string literal.

    x is the literal exactly as it appears in the scanned file, including
    its surrounding quotes.  Hex, unicode, and other escapes are decoded
    using Python string literal syntax, which is close enough to C and
    ECMAScript for this purpose.

    Returns the decoded Unicode string, or None if the literal cannot be
    parsed (a diagnostic is written to stderr in that case).
    """
    try:
        # The 'u' prefix makes the literal a Unicode string.  Unlike
        # eval(), ast.literal_eval() accepts literals only, so text taken
        # from a scanned file can never be executed as code.
        return ast.literal_eval('u' + x)
    except (SyntaxError, ValueError, TypeError):
        # Best effort: report the unparseable literal and carry on.
        sys.stderr.write('Failed to parse: ' + repr(x) + ', ignoring\n')
        return None
def scan(f, fn):
    """Scan one input file and record every candidate string in strmap.

    f is iterated line by line and each line is decoded as UTF-8, so it
    must yield byte strings.  fn is the file name; it is only used to
    select the scan rules.
    """
    # Scan rules depend on file type: a '.c' file contributes only
    # double-quoted string literals, anything else gets every rule.
    all_rules = not fn.endswith('.c')

    # Patterns whose first group is recorded as-is.
    name_patterns = [re_funcname, re_vardecl, re_varassign] if all_rules else []

    # Patterns whose first group is a quoted literal that must be decoded.
    literal_patterns = [re_strlit_dquot]
    if all_rules:
        literal_patterns.append(re_strlit_squot)

    for raw_line in f:
        # Assume input data is UTF-8
        text = raw_line.decode('utf-8')

        for pattern in name_patterns:
            for match in pattern.finditer(text):
                strmap[match.group(1)] = True

        if all_rules:
            for match in re_propref.finditer(text):
                components = match.group(1).split('.')
                if re_digits.match(components[0]) is not None:
                    # Probably a number ('4.0' or such)
                    continue
                for component in components:
                    strmap[component] = True

        for pattern in literal_patterns:
            for match in pattern.finditer(text):
                decoded = strDecode(match.group(1))
                if decoded is not None:
                    strmap[decoded] = True
def main():
    """Scan the files named on the command line and print the result.

    Every argument in sys.argv[1:] is opened and passed to scan().  The
    collected strings are then printed to stdout as a JSON document with
    two parallel, sorted lists: the strings themselves and their
    base64-encoded UTF-8 form.
    """
    for fn in sys.argv[1:]:
        # Binary mode: scan() decodes each line as UTF-8 itself.  The
        # 'with' block closes the file even if scanning raises.
        with open(fn, 'rb') as f:
            scan(f, fn)

    strs = sorted(strmap)

    # b64encode() never inserts line breaks, so a long string still
    # yields a single unbroken base64 value.  Its result is decoded as
    # ASCII to get a text string for JSON.
    strs_base64 = [
        base64.b64encode(s.encode('utf-8')).decode('ascii') for s in strs
    ]

    doc = {
        # Strings as Unicode strings
        'scanned_strings': strs,

        # Strings as base64-encoded UTF-8 data, which should be ready
        # to be used in C code (Duktape internal string representation
        # is UTF-8)
        'scanned_strings_base64': strs_base64
    }

    print(json.dumps(doc, indent=4, ensure_ascii=True, sort_keys=True))


if __name__ == '__main__':
    main()