-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathso_xml_to_tsv.py
More file actions
158 lines (115 loc) · 4.52 KB
/
so_xml_to_tsv.py
File metadata and controls
158 lines (115 loc) · 4.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License
#
# This script filters the posts and keeps those posts that are or belong
# to a question that has been asked in 2011 or 2012.
#
import os
import re

# Prefer UltraJSON when installed -- a faster drop-in replacement for json.
# Catch only ImportError: a bare `except:` would also swallow
# KeyboardInterrupt/SystemExit raised during import.
try:
    import ujson as json  # UltraJSON if available
except ImportError:
    import json

from collections import defaultdict
from operator import itemgetter
# cElementTree was deprecated in 3.3 and removed in Python 3.9;
# ElementTree is the same C-accelerated implementation on all Python 3.
from xml.etree import ElementTree as etree

from dateutil import parser as dateparser

from data import DATA_DIR
# Input: posts dump restricted to 2011/2012; output: one TSV row per kept post.
filename = os.path.join(DATA_DIR, "posts-2011-12.xml")
filename_filtered = os.path.join(DATA_DIR, "filtered.tsv")

q_creation = {}  # question Id -> creation datetime of the question
q_accepted = {}  # question Id -> Id of its accepted answer (if any)
# question Id -> [(answer Id, IsAccepted, TimeToAnswer, Score), ...]
meta = defaultdict(list)

# regex to find code snippets (DOTALL so <pre> blocks may span lines)
code_match = re.compile('<pre>(.*?)</pre>', re.MULTILINE | re.DOTALL)
# anchor tags with an http:// href; group 1 is the anchor text
link_match = re.compile(
    '<a href="http://.*?".*?>(.*?)</a>', re.MULTILINE | re.DOTALL)
img_match = re.compile('<img(.*?)/>', re.MULTILINE | re.DOTALL)
# any remaining HTML tag
tag_match = re.compile('<[^>]*>', re.MULTILINE | re.DOTALL)
def filter_html(s):
    """Strip markup from a post body and collect simple text statistics.

    Parameters
    ----------
    s : str
        Raw HTML body of a Stack Overflow post.

    Returns
    -------
    tuple
        (link_free_s, num_text_tokens, num_code_lines, link_count,
        num_images) where link_free_s is the text with code, tags and
        bare-URL anchor texts removed, num_text_tokens approximates the
        word count (spaces in the tag-free text), num_code_lines is the
        number of newlines inside <pre> blocks, link_count the number of
        links outside code blocks, and num_images the <img> tag count.
    """
    num_images = len(img_match.findall(s))

    # Count code lines and links that appear *inside* code blocks --
    # those links must not count towards the post's link total.
    num_code_lines = 0
    link_count_in_code = 0
    for code_str in code_match.findall(s):
        num_code_lines += code_str.count('\n')
        link_count_in_code += len(link_match.findall(code_str))

    # Hoisted out of the loop: a single sub() already removes every
    # <pre> block, so re-running it per match was redundant work.
    code_free_s = code_match.sub("", s)

    anchors = link_match.findall(s)
    link_count = len(anchors) - link_count_in_code

    # Drop all remaining tags, collapse runs of spaces, join lines.
    html_free_s = re.sub(
        " +", " ", tag_match.sub('', code_free_s)).replace("\n", "")

    # Remove anchor texts that are themselves bare URLs.
    link_free_s = html_free_s
    for anchor in anchors:
        if anchor.lower().startswith("http://"):
            link_free_s = link_free_s.replace(anchor, '')

    # Rough token count: number of spaces in the tag-free text.
    num_text_tokens = html_free_s.count(" ")

    return link_free_s, num_text_tokens, num_code_lines, link_count, num_images
# Aggregate statistics, filled in by parsexml() below and printed at the end.
years = defaultdict(int)  # question creation year -> number of questions
num_questions = 0
num_answers = 0
def parsexml(filename):
    """Stream rows from the posts XML dump, yielding one tuple per kept post.

    Questions (PostTypeId == 1) are always kept; answers (PostTypeId == 2)
    are kept only when their parent question appears earlier in the dump.
    Side effects: fills the module-level q_creation, q_accepted, meta and
    years containers and the num_questions/num_answers counters.

    Yields
    ------
    tuple
        (Id, ParentId, IsAccepted, TimeToAnswer, Score, Text,
        NumTextTokens, NumCodeLines, LinkCount, NumImages)
    """
    global num_questions, num_answers

    counter = 0

    it = map(itemgetter(1),
             iter(etree.iterparse(filename, events=('start',))))

    root = next(it)  # get posts element

    for elem in it:
        if counter % 100000 == 0:
            print(counter)  # progress indicator for multi-GB dumps

        counter += 1

        if elem.tag == 'row':
            creation_date = dateparser.parse(elem.get('CreationDate'))

            Id = int(elem.get('Id'))
            PostTypeId = int(elem.get('PostTypeId'))
            Score = int(elem.get('Score'))

            if PostTypeId == 1:  # question
                num_questions += 1
                years[creation_date.year] += 1

                ParentId = -1  # questions have no parent
                TimeToAnswer = 0
                q_creation[Id] = creation_date
                accepted = elem.get('AcceptedAnswerId')
                if accepted:
                    q_accepted[Id] = int(accepted)
                IsAccepted = 0

            elif PostTypeId == 2:  # answer
                num_answers += 1

                ParentId = int(elem.get('ParentId'))
                if ParentId not in q_creation:
                    # question was too far in the past
                    continue

                # BUG FIX: timedelta.seconds drops the days component,
                # wrapping around every 24h; total_seconds() is the real
                # elapsed time (truncated to whole seconds as before).
                TimeToAnswer = int(
                    (creation_date - q_creation[ParentId]).total_seconds())

                if ParentId in q_accepted:
                    IsAccepted = int(q_accepted[ParentId] == Id)
                else:
                    IsAccepted = 0

                meta[ParentId].append((Id, IsAccepted, TimeToAnswer, Score))
            else:
                # neither question nor answer (e.g. wiki/tag posts)
                continue

            Text, NumTextTokens, NumCodeLines, LinkCount, NumImages = filter_html(
                elem.get('Body'))

            values = (Id, ParentId,
                      IsAccepted,
                      TimeToAnswer, Score,
                      Text,
                      NumTextTokens, NumCodeLines, LinkCount, NumImages)

            yield values

        root.clear()  # preserve memory
# filename_filtered is already joined with DATA_DIR above; joining it with
# DATA_DIR a second time was redundant (and wrong for a relative DATA_DIR).
# Open in text mode with an explicit encoding: in Python 3 a text-mode file
# takes str, so the old `line.encode("utf-8") + "\n"` raised TypeError
# (bytes + str).  The file object now performs the UTF-8 encoding.
with open(filename_filtered, "w", encoding="utf-8") as f:
    for item in parsexml(filename):
        f.write("\t".join(map(str, item)) + "\n")

# Persist the per-question answer metadata for the later analysis steps.
with open(os.path.join(DATA_DIR, "filtered-meta.json"), "w") as f:
    json.dump(meta, f)

print("years:", years)
print("#questions: %i" % num_questions)  # fixed typo: was "#qestions"
print("#answers: %i" % num_answers)