|
1 | | -#!/usr/bin/env python |
| 1 | +#!/usr/bin/env python3 |
2 | 2 |
|
3 | 3 | # Copyright (C) 2012 p12 <tir5c3@yahoo.co.uk> |
4 | 4 | # |
|
25 | 25 | import re |
26 | 26 | import os |
27 | 27 |
|
28 | | -# find all html files |
29 | | -html_files = [] |
30 | | -for root, dirnames, filenames in os.walk('output'): |
31 | | - for filename in fnmatch.filter(filenames, '*.html'): |
32 | | - html_files.append(os.path.join(root, filename)) |
33 | | - |
34 | | -# create an xml file containing mapping between page title and actual location |
35 | | -root = e.Element('files') |
36 | | - |
37 | | -for fn in html_files: |
38 | | - f = open(fn, "r") |
39 | | - text = f.read() |
40 | | - f.close() |
41 | | - |
42 | | - m = re.search('<script>[^<]*mw\.config\.set([^<]*wgPageName[^<]*)</script>', text) |
43 | | - if not m: |
44 | | - continue |
45 | | - text = m.group(1) |
46 | | - text = re.sub('\s*', '', text) |
47 | | - m = re.search('"wgPageName":"([^"]*)"', text) |
48 | | - if not m: |
49 | | - continue |
50 | | - |
51 | | - title = m.group(1) |
52 | | - |
53 | | - target = os.path.relpath(os.path.abspath(fn), os.path.abspath('output')) |
54 | | - file_el = e.SubElement(root, 'file') |
55 | | - file_el.set('from', title) |
56 | | - file_el.set('to', target) |
57 | | - |
58 | | -out = open('link-map.xml', 'w') |
59 | | -out.write('<?xml version="1.0" encoding="UTF-8"?>') |
60 | | -out.write(e.tostring(root, pretty_print=True)) |
61 | | -out.close() |
def build_link_map(directory):
    """Build a mapping from page title to HTML file location.

    Walks ``directory`` recursively, reads every ``*.html`` file and extracts
    the ``wgPageName`` value from the inline ``mw.config.set`` script block.

    Args:
        directory: root directory to scan, either 'output' or 'reference'.

    Returns:
        A dict { title -> filename }, where filename is the path of the HTML
        file relative to ``directory``. Files in which no ``wgPageName`` entry
        is found are skipped. If several files report the same title, the one
        visited last wins.
    """
    # NOTE: the parentheses after "set" are a regex capture group, not literal
    # parentheses; group(1) is everything between "mw.config.set" and
    # "</script>", which must contain "wgPageName".
    config_re = re.compile(
        r'<script>[^<]*mw\.config\.set([^<]*wgPageName[^<]*)</script>')
    title_re = re.compile(r'"wgPageName":"([^"]*)"')
    base_dir = os.path.abspath(directory)

    link_map = {}

    for root, dirnames, filenames in os.walk(directory):
        for filename in fnmatch.filter(filenames, '*.html'):
            fn = os.path.join(root, filename)

            # Explicit encoding so the result does not depend on the locale;
            # 'with' closes the file even if reading fails.
            # NOTE(review): assumes the pages are UTF-8 encoded - confirm.
            with open(fn, "r", encoding="utf-8") as f:
                text = f.read()

            m = config_re.search(text)
            if not m:
                continue

            # Drop all whitespace so the key/value pattern below does not
            # have to account for formatting differences.
            config = re.sub(r'\s+', '', m.group(1))
            m = title_re.search(config)
            if not m:
                continue

            title = m.group(1)
            link_map[title] = os.path.relpath(os.path.abspath(fn), base_dir)

    return link_map
| 58 | + |
def main():
    """Write 'link-map.xml' describing where each page of 'output' lives.

    The file contains a <files> root element with one <file> child per page;
    the 'from' attribute holds the page title and the 'to' attribute holds the
    location of the HTML file relative to the 'output' directory.
    """
    link_map = build_link_map('output')

    # create an xml file containing mapping between page title and actual location
    # NOTE(review): 'e' is presumably lxml.etree imported above the visible
    # part of the file ('pretty_print' is an lxml keyword) - confirm.
    root = e.Element('files')

    for title, target in link_map.items():
        file_el = e.SubElement(root, 'file')
        file_el.set('from', title)
        file_el.set('to', target)

    # The XML declaration below announces UTF-8, so write the file as UTF-8
    # regardless of the locale's default encoding.
    with open('link-map.xml', 'w', encoding='utf-8') as out:
        out.write('<?xml version="1.0" encoding="UTF-8"?>')
        out.write(e.tostring(root, encoding=str, pretty_print=True))


if __name__ == "__main__":
    main()
0 commit comments