-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathso_xml_to_tsv.py
More file actions
158 lines (115 loc) · 4.52 KB
/
so_xml_to_tsv.py
File metadata and controls
158 lines (115 loc) · 4.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License
#
# This script filters the posts and keeps those posts that are or belong
# to a question that has been asked in 2011 or 2012.
#
import os
import re

# Prefer UltraJSON when installed -- a faster drop-in replacement for json.
# Catch only ImportError: a bare `except:` would also swallow
# KeyboardInterrupt/SystemExit raised during import.
try:
    import ujson as json  # UltraJSON if available
except ImportError:
    import json

from collections import defaultdict
from operator import itemgetter
# cElementTree was deprecated in 3.3 and removed in Python 3.9;
# ElementTree is the same C-accelerated implementation on all Python 3.
from xml.etree import ElementTree as etree

from dateutil import parser as dateparser

from data import DATA_DIR
# Input: posts dump restricted to 2011/2012; output: one TSV row per kept post.
filename = os.path.join(DATA_DIR, "posts-2011-12.xml")
filename_filtered = os.path.join(DATA_DIR, "filtered.tsv")

q_creation = {}  # question Id -> creation datetime of the question
q_accepted = {}  # question Id -> Id of its accepted answer (if any)
# question Id -> [(answer Id, IsAccepted, TimeToAnswer, Score), ...]
meta = defaultdict(list)

# regex to find code snippets (DOTALL so <pre> blocks may span lines)
code_match = re.compile('<pre>(.*?)</pre>', re.MULTILINE | re.DOTALL)
# anchor tags with an http:// href; group 1 is the anchor text
link_match = re.compile(
    '<a href="http://.*?".*?>(.*?)</a>', re.MULTILINE | re.DOTALL)
img_match = re.compile('<img(.*?)/>', re.MULTILINE | re.DOTALL)
# any remaining HTML tag
tag_match = re.compile('<[^>]*>', re.MULTILINE | re.DOTALL)
def filter_html(s):
    """Strip markup from a post body and collect simple text statistics.

    Parameters
    ----------
    s : str
        Raw HTML body of a Stack Overflow post.

    Returns
    -------
    tuple
        (link_free_s, num_text_tokens, num_code_lines, link_count,
        num_images) where link_free_s is the text with code, tags and
        bare-URL anchor texts removed, num_text_tokens approximates the
        word count (spaces in the tag-free text), num_code_lines is the
        number of newlines inside <pre> blocks, link_count the number of
        links outside code blocks, and num_images the <img> tag count.
    """
    num_images = len(img_match.findall(s))

    # Count code lines and links that appear *inside* code blocks --
    # those links must not count towards the post's link total.
    num_code_lines = 0
    link_count_in_code = 0
    for code_str in code_match.findall(s):
        num_code_lines += code_str.count('\n')
        link_count_in_code += len(link_match.findall(code_str))

    # Hoisted out of the loop: a single sub() already removes every
    # <pre> block, so re-running it per match was redundant work.
    code_free_s = code_match.sub("", s)

    anchors = link_match.findall(s)
    link_count = len(anchors) - link_count_in_code

    # Drop all remaining tags, collapse runs of spaces, join lines.
    html_free_s = re.sub(
        " +", " ", tag_match.sub('', code_free_s)).replace("\n", "")

    # Remove anchor texts that are themselves bare URLs.
    link_free_s = html_free_s
    for anchor in anchors:
        if anchor.lower().startswith("http://"):
            link_free_s = link_free_s.replace(anchor, '')

    # Rough token count: number of spaces in the tag-free text.
    num_text_tokens = html_free_s.count(" ")

    return link_free_s, num_text_tokens, num_code_lines, link_count, num_images
# Aggregate statistics, filled in by parsexml() below and printed at the end.
years = defaultdict(int)  # question creation year -> number of questions
num_questions = 0
num_answers = 0
def parsexml(filename):
    """Stream rows from the posts XML dump, yielding one tuple per kept post.

    Questions (PostTypeId == 1) are always kept; answers (PostTypeId == 2)
    are kept only when their parent question appears earlier in the dump.
    Side effects: fills the module-level q_creation, q_accepted, meta and
    years containers and the num_questions/num_answers counters.

    Yields
    ------
    tuple
        (Id, ParentId, IsAccepted, TimeToAnswer, Score, Text,
        NumTextTokens, NumCodeLines, LinkCount, NumImages)
    """
    global num_questions, num_answers

    counter = 0

    it = map(itemgetter(1),
             iter(etree.iterparse(filename, events=('start',))))

    root = next(it)  # get posts element

    for elem in it:
        if counter % 100000 == 0:
            print(counter)  # progress indicator for multi-GB dumps

        counter += 1

        if elem.tag == 'row':
            creation_date = dateparser.parse(elem.get('CreationDate'))

            Id = int(elem.get('Id'))
            PostTypeId = int(elem.get('PostTypeId'))
            Score = int(elem.get('Score'))

            if PostTypeId == 1:  # question
                num_questions += 1
                years[creation_date.year] += 1

                ParentId = -1  # questions have no parent
                TimeToAnswer = 0
                q_creation[Id] = creation_date
                accepted = elem.get('AcceptedAnswerId')
                if accepted:
                    q_accepted[Id] = int(accepted)
                IsAccepted = 0

            elif PostTypeId == 2:  # answer
                num_answers += 1

                ParentId = int(elem.get('ParentId'))
                if ParentId not in q_creation:
                    # question was too far in the past
                    continue

                # BUG FIX: timedelta.seconds drops the days component,
                # wrapping around every 24h; total_seconds() is the real
                # elapsed time (truncated to whole seconds as before).
                TimeToAnswer = int(
                    (creation_date - q_creation[ParentId]).total_seconds())

                if ParentId in q_accepted:
                    IsAccepted = int(q_accepted[ParentId] == Id)
                else:
                    IsAccepted = 0

                meta[ParentId].append((Id, IsAccepted, TimeToAnswer, Score))
            else:
                # neither question nor answer (e.g. wiki/tag posts)
                continue

            Text, NumTextTokens, NumCodeLines, LinkCount, NumImages = filter_html(
                elem.get('Body'))

            values = (Id, ParentId,
                      IsAccepted,
                      TimeToAnswer, Score,
                      Text,
                      NumTextTokens, NumCodeLines, LinkCount, NumImages)

            yield values

        root.clear()  # preserve memory
# filename_filtered is already joined with DATA_DIR above; joining it with
# DATA_DIR a second time was redundant (and wrong for a relative DATA_DIR).
# Open in text mode with an explicit encoding: in Python 3 a text-mode file
# takes str, so the old `line.encode("utf-8") + "\n"` raised TypeError
# (bytes + str).  The file object now performs the UTF-8 encoding.
with open(filename_filtered, "w", encoding="utf-8") as f:
    for item in parsexml(filename):
        f.write("\t".join(map(str, item)) + "\n")

# Persist the per-question answer metadata for the later analysis steps.
with open(os.path.join(DATA_DIR, "filtered-meta.json"), "w") as f:
    json.dump(meta, f)

print("years:", years)
print("#questions: %i" % num_questions)  # fixed typo: was "#qestions"
print("#answers: %i" % num_answers)