codeql/python/extractor/semmle/python/passes/lexical.py at main · github/codeql

History

153 lines (134 loc) · 5.92 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

import ast

import sys

import math

from semmle.python.passes.ast_pass import iter_fields

from semmle.python import ast

from semmle.python.passes._pass import Pass

# Only the pass class is part of the public interface of this module.
__all__ = ['LexicalPass']

# Nodes whose line span is counted as code by LexicalModule.emit_line_counts.
STMT_OR_EXPR = (ast.expr, ast.stmt)

# Nodes that get a 'py_locations' row from LexicalModule.emit_locations.
LOCATABLE = STMT_OR_EXPR + (
    ast.pattern,
    ast.comprehension,
    ast.StringPart,
    ast.keyword,
    ast.KeyValuePair,
    ast.DictUnpacking,
    ast.type_parameter,
)

# Nodes that get a 'py_scope_location' row instead of a 'py_locations' row.
CLASS_OR_FUNCTION = (ast.Class, ast.Function)

# Nodes for which per-scope line counts are written.
SCOPES = (ast.Class, ast.Function, ast.Module)

class LexicalPass(Pass):
    """Pass that hands a module over to LexicalModule for extraction."""

    def extract(self, ast, comments, writer):
        """The entry point.

        Builds a LexicalModule from the module tree, its comments and the
        writer, then runs its extraction.
        """
        lexical_module = LexicalModule(ast, comments, writer)
        lexical_module.extract()

class LexicalModule(object):
    """Object for extracting lexical information for the given module."""

    def __init__(self, ast, comments, writer):
        """Remember the module tree, its comments and the writer.

        `comments` is iterated as (text, start, end) triples; `writer` must
        provide get_node_id, get_unique_id and write_tuple.
        """
        assert ast is not None and comments is not None
        self.ast = ast
        self.comments = comments
        self.writer = writer
        self.module_id = writer.get_node_id(ast)

    def extract(self):
        """Write the module's scope location, then comment/line info and node locations."""
        module_loc = self.get_location(0, 0, 0, 0)
        self.writer.write_tuple(u'py_scope_location', 'rr', module_loc, self.module_id)
        self.emit_line_info()
        self.emit_locations(self.ast)

    def emit_line_info(self):
        """Write one 'py_comments' row per comment, then the line counts."""
        for text, start, end in self.comments:
            # The concatenated (start, end) tuple identifies the comment uniquely.
            comment_id = str(start + end)
            # Start column is shifted by one; the end column is passed through unchanged.
            loc_id = self.get_location(start[0], start[1]+1, end[0], end[1])
            try:
                self.writer.write_tuple(u'py_comments', 'nsr', comment_id, text, loc_id)
            except UnicodeDecodeError:
                # Handle non-ascii comments. Should only happen in Py2
                assert sys.hexversion < 0x03000000
                decoded = text.decode("latin8")
                self.writer.write_tuple(u'py_comments', 'nsr', comment_id, decoded, loc_id)
        self.emit_line_counts(self.ast, set(), get_comment_bits(self.comments))

    def emit_line_counts(self, node, code_lines, comment_bits):
        """Recursively collect code lines below `node` into `code_lines`.

        Scopes with a non-empty body additionally get their own line-count
        rows written; lists are walked element by element; statements and
        expressions contribute every line from `lineno` to their end line.
        Anything else is ignored.
        """
        if isinstance(node, SCOPES) and node.body:
            self._emit_scope_line_counts(node, code_lines, comment_bits)
        elif isinstance(node, list):
            for item in node:
                self.emit_line_counts(item, code_lines, comment_bits)
        elif isinstance(node, STMT_OR_EXPR):
            for _, _, child in iter_fields(node):
                self.emit_line_counts(child, code_lines, comment_bits)
            assert hasattr(node, "lineno"), node
            first = node.lineno
            last, _ = node._end
            code_lines.update(range(first, last + 1))

    def _emit_scope_line_counts(self, node, code_lines, comment_bits):
        """Write the code/comment/docstring/all line counts for one scope."""
        doc_line_count = 0
        first_stmt = node.body[0]
        # A leading expression statement holding a string is treated as the docstring.
        if type(first_stmt) == ast.Expr and isinstance(first_stmt.value, ast.Str):
            doc = first_stmt.value
            doc_line_count = doc._end[0] - doc.lineno + 1

        # Lines are gathered per scope first, then merged into the enclosing set.
        scope_lines = {node.lineno}
        for _, _, child in iter_fields(node):
            self.emit_line_counts(child, scope_lines, comment_bits)
        assert scope_lines

        first, last = min(scope_lines), max(scope_lines)
        is_module = isinstance(node, ast.Module)
        if is_module:
            # Comments after the last code line still extend the module.
            last = max(last, last_line(comment_bits))

        comment_count = get_lines_in_range(comment_bits, first, last)
        code_count = len(scope_lines) - doc_line_count
        code_lines.update(scope_lines)

        counts = (
            (u'code', code_count),
            (u'comment', comment_count),
            (u'docstring', doc_line_count),
            (u'all', last - first + 1),
        )
        for label, count in counts:
            self.print_lines(label, node, count)

        if is_module:
            non_code = comment_count + doc_line_count
            self.writer.write_tuple(u'numlines', 'rddd', self.module_id,
                                    code_count + non_code, code_count, non_code)

    def print_lines(self, name, node, count):
        """Write a 'py_<name>lines' row associating `count` with `node`."""
        relation = u'py_%slines' % name
        self.writer.write_tuple(relation, 'nd', node, count)

    def get_location(self, bl, bc, el, ec):
        """Write a 'locations_ast' row for the given span and return its fresh id."""
        new_id = self.writer.get_unique_id()
        self.writer.write_tuple(u'locations_ast', 'rrdddd',
                                new_id, self.module_id, bl, bc, el, ec)
        return new_id

    def emit_locations(self, node):
        """Recursively write location rows for `node` and everything below it."""
        if not isinstance(node, ast.AstBase):
            if isinstance(node, list):
                for item in node:
                    self.emit_locations(item)
            return

        if isinstance(node, LOCATABLE):
            self._write_location(node)
        elif isinstance(node, CLASS_OR_FUNCTION):
            end_line, end_col = node._end
            scope_loc = self.get_location(node.lineno, node.col_offset+1, end_line, end_col)
            self.writer.write_tuple(u'py_scope_location', 'rn', scope_loc, node)

        for _, _, child in iter_fields(node):
            self.emit_locations(child)

    def _write_location(self, node):
        """Write a 'py_locations' row tying `node` to a new location."""
        start_line, start_col = node.lineno, node.col_offset+1
        assert len(node._end) == 2, node
        end_line, end_col = node._end
        node_loc = self.get_location(start_line, start_col, end_line, end_col)
        self.writer.write_tuple(u'py_locations', 'rn', node_loc, node)

def get_comment_bits(comments):
    """Return an integer bitmask with bit N set for each line N spanned by a comment.

    `comments` is iterated as (text, start, end) triples, where `start` and
    `end` are two-element (line, column) pairs. Every line from the start
    line to the end line, inclusive, is marked.
    """
    mask = 0
    for _, (first, _), (last, _) in comments:
        for line in range(first, last + 1):
            mask |= 1 << line
    return mask

def get_lines_in_range(bits, start, end):
    """Count the set bits of `bits` at positions `start` through `end`, inclusive.

    A negative `end` means "no upper bound": every set bit at or above
    `start` is counted. A range whose end lies more than one position before
    its start yields 0.
    """
    if end < 0:
        return bin(bits >> start).count('1')
    width = end - start + 1
    if width < 0:
        return 0
    window = (bits >> start) & ((1 << width) - 1)
    return bin(window).count('1')

def last_line(n):
    """Return the position of the highest set bit in the bitmask `n`.

    Returns 0 when `n` is zero or negative (no bits set).

    Uses int.bit_length() rather than a floating-point logarithm: line
    bitmasks are arbitrarily large integers, and converting them to a float
    loses precision, so a value just below a power of two could be reported
    one position too high.
    """
    if n <= 0:
        return 0
    return n.bit_length() - 1

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

lexical.py

Latest commit

History

lexical.py

File metadata and controls