-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Expand file tree
/
Copy pathlexical.py
More file actions
153 lines (134 loc) · 5.92 KB
/
lexical.py
File metadata and controls
153 lines (134 loc) · 5.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import ast
import sys
import math
from semmle.python.passes.ast_pass import iter_fields
from semmle.python import ast
from semmle.python.passes._pass import Pass
# Public names of this module.
__all__ = ["LexicalPass"]

# Groups of AST node classes used in isinstance() checks by the extractor.
STMT_OR_EXPR = (ast.expr, ast.stmt)

# Node kinds for which a `py_locations` tuple is written.
LOCATABLE = STMT_OR_EXPR + (
    ast.pattern,
    ast.comprehension,
    ast.StringPart,
    ast.keyword,
    ast.KeyValuePair,
    ast.DictUnpacking,
    ast.type_parameter,
)

# Node kinds for which a `py_scope_location` tuple is written.
CLASS_OR_FUNCTION = (ast.Class, ast.Function)

# Node kinds for which per-scope line counts are emitted.
SCOPES = (ast.Class, ast.Function, ast.Module)
class LexicalPass(Pass):
    """Pass that delegates lexical extraction to a `LexicalModule`."""

    def extract(self, ast, comments, writer):
        """Entry point: build a `LexicalModule` for the inputs and run it.

        The three arguments are forwarded unchanged to `LexicalModule`.
        """
        module = LexicalModule(ast, comments, writer)
        module.extract()
class LexicalModule(object):
    """Extracts lexical information for a single module.

    Everything is reported through ``writer.write_tuple``: comment text,
    per-scope line counts, and source locations of AST nodes.
    """

    def __init__(self, ast, comments, writer):
        """Remember the inputs and look up the module's node id.

        ``comments`` is iterated as ``(text, start, end)`` triples, where
        ``start`` and ``end`` are ``(line, column)`` pairs.
        """
        assert not (ast is None or comments is None)
        self.writer = writer
        self.comments = comments
        self.ast = ast
        self.module_id = writer.get_node_id(ast)

    def extract(self):
        """Write the module scope location, then comments, counts and locations."""
        module_loc = self.get_location(0, 0, 0, 0)
        self.writer.write_tuple(
            u'py_scope_location', 'rr', module_loc, self.module_id)
        self.emit_line_info()
        self.emit_locations(self.ast)

    def emit_line_info(self):
        """Write one `py_comments` tuple per comment, then all line counts."""
        for text, start, end in self.comments:
            # The concatenated (line, col) pairs give a string that is
            # unique per comment.
            comment_id = str(start + end)
            loc_id = self.get_location(
                start[0], start[1] + 1, end[0], end[1])
            try:
                self.writer.write_tuple(
                    u'py_comments', 'nsr', comment_id, text, loc_id)
            except UnicodeDecodeError:
                # Non-ascii comment text; only expected on Python 2, where
                # the text is a byte string that can be decoded and retried.
                assert sys.hexversion < 0x03000000
                text = text.decode("latin8")
                self.writer.write_tuple(
                    u'py_comments', 'nsr', comment_id, text, loc_id)
        self.emit_line_counts(
            self.ast, set(), get_comment_bits(self.comments))

    def emit_line_counts(self, node, code_lines, comment_bits):
        """Recursively collect code lines and emit line counts per scope.

        ``code_lines`` is a set that is updated in place with the line
        numbers covered by statements/expressions under ``node``.
        ``comment_bits`` is the bitmask produced by ``get_comment_bits``.
        """
        if not (isinstance(node, SCOPES) and node.body):
            if isinstance(node, list):
                for item in node:
                    self.emit_line_counts(item, code_lines, comment_bits)
            elif isinstance(node, STMT_OR_EXPR):
                # Children first, then every line this node itself spans.
                for _, _, child in iter_fields(node):
                    self.emit_line_counts(child, code_lines, comment_bits)
                assert hasattr(node, "lineno"), node
                first = node.lineno
                last, _ = node._end
                code_lines.update(range(first, last + 1))
            return

        # A scope with a non-empty body: a leading string-expression
        # statement is treated as the docstring.
        doc_lines = 0
        first_stmt = node.body[0]
        if type(first_stmt) == ast.Expr and isinstance(first_stmt.value, ast.Str):
            doc = first_stmt.value
            doc_lines = doc._end[0] - doc.lineno + 1

        # Lines are gathered in a fresh set so that counts are per scope;
        # they are merged into the caller's set afterwards.
        scope_lines = {node.lineno}
        for _, _, child in iter_fields(node):
            self.emit_line_counts(child, scope_lines, comment_bits)
        assert scope_lines

        is_module = isinstance(node, ast.Module)
        first = min(scope_lines)
        last = max(scope_lines)
        if is_module:
            # Comments after the last code line still extend the module.
            last = max(last, last_line(comment_bits))

        comment_count = get_lines_in_range(comment_bits, first, last)
        code_count = len(scope_lines) - doc_lines
        code_lines.update(scope_lines)

        per_kind = (
            (u'code', code_count),
            (u'comment', comment_count),
            (u'docstring', doc_lines),
            (u'all', last - first + 1),
        )
        for kind, count in per_kind:
            self.print_lines(kind, node, count)

        if is_module:
            self.writer.write_tuple(
                u'numlines', 'rddd', self.module_id,
                code_count + comment_count + doc_lines,
                code_count,
                comment_count + doc_lines)

    def print_lines(self, name, node, count):
        """Write a ``py_<name>lines`` tuple giving ``count`` for ``node``."""
        relation = u'py_%slines' % name
        self.writer.write_tuple(relation, 'nd', node, count)

    def get_location(self, bl, bc, el, ec):
        """Write a `locations_ast` tuple for the given span and return its id.

        The arguments are begin line, begin column, end line, end column.
        """
        new_id = self.writer.get_unique_id()
        self.writer.write_tuple(
            u'locations_ast', 'rrdddd',
            new_id, self.module_id, bl, bc, el, ec)
        return new_id

    def emit_locations(self, node):
        """Recursively write locations for ``node`` and everything under it."""
        if not isinstance(node, ast.AstBase):
            if isinstance(node, list):
                for item in node:
                    self.emit_locations(item)
            return

        if isinstance(node, LOCATABLE):
            self._write_location(node)
        elif isinstance(node, CLASS_OR_FUNCTION):
            begin_line = node.lineno
            begin_col = node.col_offset + 1
            end_line, end_col = node._end
            scope_loc = self.get_location(
                begin_line, begin_col, end_line, end_col)
            self.writer.write_tuple(
                u'py_scope_location', 'rn', scope_loc, node)

        for _, _, child in iter_fields(node):
            self.emit_locations(child)

    def _write_location(self, node):
        """Write a location for ``node`` and link it via `py_locations`."""
        begin_line = node.lineno
        begin_col = node.col_offset + 1
        assert len(node._end) == 2, node
        end_line, end_col = node._end
        node_loc = self.get_location(begin_line, begin_col, end_line, end_col)
        self.writer.write_tuple(u'py_locations', 'rn', node_loc, node)
def get_comment_bits(comments):
    """Build an integer bitmask with bit ``k`` set for every comment line ``k``.

    ``comments`` yields ``(text, start, end)`` triples whose ``start`` and
    ``end`` are ``(line, column)`` pairs; every line from the start line to
    the end line inclusive is marked.
    """
    mask = 0
    for _text, begin, finish in comments:
        first, _ = begin
        last, _ = finish
        for lineno in range(first, last + 1):
            mask |= 1 << lineno
    return mask
def get_lines_in_range(bits, start, end):
    """Count the set bits of ``bits`` at positions ``start``..``end`` inclusive.

    A negative ``end`` means no upper bound: every set bit at position
    ``start`` or above is counted. If ``end`` is non-negative and more than
    one below ``start``, the result is 0.
    """
    if end < 0:
        return bin(bits >> start).count('1')
    span = end - start + 1
    if span < 0:
        return 0
    # Drop the bits below `start`, then keep only the lowest `span` bits.
    window = (bits >> start) & ((1 << span) - 1)
    return bin(window).count('1')
def last_line(n):
    """Return the position of the highest set bit of ``n``.

    ``n`` is treated as a line bitmask (bit ``k`` set means line ``k`` is
    marked), so the result is the largest marked line number. Returns 0
    when ``n`` is zero or negative.
    """
    if n <= 0:
        return 0
    # bit_length() is exact for arbitrarily large integers, whereas going
    # through math.log() converts to floating point and can be off by one
    # for large masks (e.g. (1 << 60) - 1 rounds up to 2**60).
    return n.bit_length() - 1