Skip to content

Commit 587937e

Browse files
author
p12
committed
Transform: make build_link_map.py reusable
1 parent a6765eb commit 587937e

File tree

1 file changed

+50
-35
lines changed

1 file changed

+50
-35
lines changed

build_link_map.py

Lines changed: 50 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#!/usr/bin/env python
1+
#!/usr/bin/env python3
22

33
# Copyright (C) 2012 p12 <tir5c3@yahoo.co.uk>
44
#
@@ -25,37 +25,52 @@
2525
import re
2626
import os
2727

28-
# find all html files
29-
html_files = []
30-
for root, dirnames, filenames in os.walk('output'):
31-
for filename in fnmatch.filter(filenames, '*.html'):
32-
html_files.append(os.path.join(root, filename))
33-
34-
# create an xml file containing mapping between page title and actual location
35-
root = e.Element('files')
36-
37-
for fn in html_files:
38-
f = open(fn, "r")
39-
text = f.read()
40-
f.close()
41-
42-
m = re.search('<script>[^<]*mw\.config\.set([^<]*wgPageName[^<]*)</script>', text)
43-
if not m:
44-
continue
45-
text = m.group(1)
46-
text = re.sub('\s*', '', text)
47-
m = re.search('"wgPageName":"([^"]*)"', text)
48-
if not m:
49-
continue
50-
51-
title = m.group(1)
52-
53-
target = os.path.relpath(os.path.abspath(fn), os.path.abspath('output'))
54-
file_el = e.SubElement(root, 'file')
55-
file_el.set('from', title)
56-
file_el.set('to', target)
57-
58-
out = open('link-map.xml', 'w')
59-
out.write('<?xml version="1.0" encoding="UTF-8"?>')
60-
out.write(e.tostring(root, pretty_print=True))
61-
out.close()
28+
# returns a dict { title -> filename }.
29+
# directory - either 'output' or 'reference'
30+
def build_link_map(directory):
    """Map MediaWiki page titles to the HTML files found under *directory*.

    Walks *directory* recursively, reads every ``*.html`` file and extracts
    the ``wgPageName`` value from the inline ``mw.config.set(...)`` script.
    Files that have no such script, or no ``wgPageName`` entry in it, are
    skipped.

    Args:
        directory: Root of the tree to scan.

    Returns:
        A dict ``{title: path}`` where *path* is the location of the HTML
        file relative to *directory*.  If several files declare the same
        title, the one visited last wins.
    """
    # Patterns are compiled once, outside the per-file loop.  The parentheses
    # after ``set`` are unescaped and therefore form a capture group:
    # group(1) is everything from the call's "(" up to "</script>".
    config_re = re.compile(
        r'<script>[^<]*mw\.config\.set([^<]*wgPageName[^<]*)</script>')
    title_re = re.compile(r'"wgPageName":"([^"]*)"')
    whitespace_re = re.compile(r'\s+')

    base = os.path.abspath(directory)
    link_map = {}

    for root, _dirnames, filenames in os.walk(directory):
        for filename in fnmatch.filter(filenames, '*.html'):
            fn = os.path.join(root, filename)

            # Explicit UTF-8 instead of the locale default so titles decode
            # identically on every platform (assumes the pages are UTF-8 --
            # confirm against the mirroring step if that ever changes).
            with open(fn, 'r', encoding='utf-8') as f:
                text = f.read()

            m = config_re.search(text)
            if not m:
                continue

            # Drop all whitespace so the key/value lookup below does not
            # depend on how the JSON-like config was formatted.
            config = whitespace_re.sub('', m.group(1))
            m = title_re.search(config)
            if not m:
                continue

            link_map[m.group(1)] = os.path.relpath(os.path.abspath(fn), base)

    return link_map
58+
59+
def main():
    """Write ``link-map.xml`` for the pages found under ``output``.

    The file contains a ``<files>`` root with one ``<file from=... to=...>``
    element per entry returned by ``build_link_map('output')``: ``from`` is
    the page title and ``to`` is the path relative to ``output``.
    """
    link_map = build_link_map('output')

    # create an xml file containing mapping between page title and actual location
    # NOTE(review): ``e`` is imported outside this hunk; ``pretty_print``
    # suggests it is lxml.etree -- confirm.
    root = e.Element('files')

    for title, target in link_map.items():
        file_el = e.SubElement(root, 'file')
        file_el.set('from', title)
        file_el.set('to', target)

    # The declaration below promises UTF-8, so the file is opened as UTF-8
    # explicitly rather than with the locale default; ``with`` guarantees
    # the handle is closed even if serialisation fails.
    with open('link-map.xml', 'w', encoding='utf-8') as out:
        out.write('<?xml version="1.0" encoding="UTF-8"?>')
        out.write(e.tostring(root, encoding=str, pretty_print=True))


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)