Skip to content

Commit dacac63

Browse files
committed
Preprocess: simplify file renaming
This simplifies file renaming by using a map from old to new name. The new rename map doesn't store file paths any more, so the directory tree needs to be walked twice (once for searching, once for renaming), but transforming relative links is a lot simpler because it can use a map lookup instead of trying every entry in the rename map. Transformation of relative links now uses urllib for parsing URLs, so matching and transformation should be more robust for exotic links.
1 parent a0686ef commit dacac63

File tree

3 files changed

+70
-82
lines changed

3 files changed

+70
-82
lines changed

commands/preprocess.py

Lines changed: 47 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -81,15 +81,7 @@ def rearrange_archive(root):
8181
for fn in fnmatch.filter(os.listdir(root), 'cppreference-export*.xml'):
8282
os.remove(os.path.join(root, fn))
8383

84-
def add_file_to_rename_map(rename_map, dir, fn, new_fn):
85-
path = os.path.join(dir, fn)
86-
if not os.path.isfile(path):
87-
print("ERROR: Not renaming '{0}' because path does not exist".format(path))
88-
return
89-
rename_map.append((dir, fn, new_fn))
90-
91-
# Converts complex URL to resources supplied by MediaWiki loader to a simplified
92-
# name
84+
# Converts the complex URL of a resource served by the MediaWiki loader into a simplified name
9385
def convert_loader_name(fn):
9486
if "modules=site&only=scripts" in fn:
9587
return "site_scripts.js"
@@ -105,55 +97,36 @@ def convert_loader_name(fn):
10597
raise Exception('Loader file {0} does not match any known files'.format(fn))
10698

10799
def find_files_to_be_renamed(root):
108-
# Returns a rename map: array of tuples each of which contain three strings:
109-
# the directory the file resides in, the source and destination filenames.
110-
111-
# The rename map specifies files to be renamed in order to support them on
112-
# windows filesystems which don't support certain characters in file names
113-
rename_map = []
114-
115-
files_rename = [] # general files to be renamed
116-
files_loader = [] # files served by load.php. These should map to
117-
# consistent and short file names because we
118-
# modify some of them later in the pipeline
119-
120-
for dir, _, filenames in os.walk(root):
121-
filenames_loader = set(fnmatch.filter(filenames, 'load.php[?]*'))
122-
# match any filenames with '?"*' characters
123-
filenames_rename = set(fnmatch.filter(filenames, '*[?"*]*'))
124-
125-
# don't process load.php files in general rename handler
126-
filenames_rename -= filenames_loader
127-
128-
for fn in filenames_loader:
129-
files_loader.append((dir, fn))
130-
for fn in filenames_rename:
131-
files_rename.append((dir, fn))
132-
133-
for dir, orig_fn in files_rename:
134-
fn = orig_fn
135-
fn = re.sub(r'\?.*', '', fn)
136-
fn = fn.replace('"', '_q_')
137-
fn = fn.replace('*', '_star_')
138-
add_file_to_rename_map(rename_map, dir, orig_fn, fn)
139-
140-
# map loader names to more recognizable names
141-
for dir, fn in files_loader:
142-
new_fn = convert_loader_name(fn)
143-
add_file_to_rename_map(rename_map, dir, fn, new_fn)
144-
145-
# rename filenames that conflict on case-insensitive filesystems
146-
# TODO: perform this automatically
147-
add_file_to_rename_map(rename_map, os.path.join(root, 'en/cpp/numeric/math'), 'NAN.html', 'NAN.2.html')
148-
add_file_to_rename_map(rename_map, os.path.join(root, 'en/c/numeric/math'), 'NAN.html', 'NAN.2.html')
149-
return rename_map
150-
151-
def rename_files(rename_map):
152-
for dir, old_fn, new_fn in rename_map:
153-
src_path = os.path.join(dir, old_fn)
154-
dst_path = os.path.join(dir, new_fn)
155-
print("Renaming '{0}' to \n '{1}'".format(src_path, dst_path))
156-
shutil.move(src_path, dst_path)
100+
# Returns a rename map: a map from old to new file name
101+
loader = re.compile(r'load\.php\?.*')
102+
query = re.compile(r'\?.*')
103+
result = dict()
104+
105+
# find files with invalid names -> rename all occurrences
106+
for fn in set(fn for _, _, filenames in os.walk(root) for fn in filenames):
107+
if loader.match(fn):
108+
result[fn] = convert_loader_name(fn)
109+
110+
elif any((c in fn) for c in '?*"'):
111+
new_fn = query.sub('', fn)
112+
new_fn = new_fn.replace('"', '_q_')
113+
new_fn = new_fn.replace('*', '_star_')
114+
result[fn] = new_fn
115+
116+
# rename files that conflict on case-insensitive filesystems
117+
# TODO perform this automatically
118+
result['NAN.html'] = 'NAN.2.html'
119+
120+
return result
121+
122+
def rename_files(root, rename_map):
123+
for dir, old_fn in ((dir, fn) for dir, _, filenames in os.walk(root) for fn in filenames):
124+
new_fn = rename_map.get(old_fn)
125+
if new_fn is not None:
126+
src_path = os.path.join(dir, old_fn)
127+
dst_path = os.path.join(dir, new_fn)
128+
print("Renaming '{0}' to \n '{1}'".format(src_path, dst_path))
129+
shutil.move(src_path, dst_path)
157130

158131
def find_html_files(root):
159132
# find files that need to be preprocessed
@@ -199,26 +172,24 @@ def transform_ranges_placeholder(target, file, root):
199172
return os.path.relpath(abstarget, os.path.dirname(file))
200173

201174
def is_external_link(target):
202-
external_link_patterns = [
203-
'http://',
204-
'https://',
205-
'ftp://'
206-
]
207-
for pattern in external_link_patterns:
208-
if target.startswith(pattern):
209-
return True
210-
return False
175+
url = urllib.parse.urlparse(target)
176+
return url.scheme != '' or url.netloc != ''
211177

212178
def trasform_relative_link(rename_map, target):
213-
target = urllib.parse.unquote(target)
214-
for _, fn, new_fn in rename_map:
215-
target = target.replace(fn, new_fn)
216-
target = target.replace('../../upload.cppreference.com/mwiki/','../common/')
217-
target = target.replace('../mwiki/','../common/')
218-
target = re.sub(r'(\.php|\.css)\?.*', r'\1', target)
219-
target = urllib.parse.quote(target)
220-
target = target.replace('%23', '#')
221-
return target
179+
# urllib.parse.urlparse returns (scheme, netloc, path, params, query, fragment)
180+
_, _, path, params, _, fragment = urllib.parse.urlparse(target)
181+
assert params == ''
182+
183+
path = urllib.parse.unquote(path)
184+
path = path.replace('../../upload.cppreference.com/mwiki/','../common/')
185+
path = path.replace('../mwiki/','../common/')
186+
187+
dir, fn = os.path.split(path)
188+
fn = rename_map.get(fn, fn)
189+
path = os.path.join(dir, fn)
190+
191+
path = urllib.parse.quote(path)
192+
return urllib.parse.urlunparse(('', '', path, params, '', fragment))
222193

223194
# Transforms a link in the given file according to rename map.
224195
# target is the link to transform.

preprocess.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def main():
3939
preprocess.rearrange_archive(root)
4040

4141
rename_map = preprocess.find_files_to_be_renamed(root)
42-
preprocess.rename_files(rename_map)
42+
preprocess.rename_files(root, rename_map)
4343

4444
# clean the html files
4545
file_list = preprocess.find_html_files(root)

tests/test_preprocess.py

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -90,11 +90,28 @@ def test_has_class(self):
9090

9191
class TestIsExternalLink(unittest.TestCase):
9292
def test_is_external_link(self):
93-
self.assertEqual(True, is_external_link('http://a'))
94-
self.assertEqual(True, is_external_link('https://a'))
95-
self.assertEqual(True, is_external_link('ftp://a'))
96-
self.assertEqual(False, is_external_link('ahttp://a'))
97-
self.assertEqual(False, is_external_link(' http://a'))
93+
external = [
94+
'http://example.com',
95+
'https://example.com',
96+
'ftp://example.com',
97+
'ftps://example.com',
98+
'slack://example.com',
99+
'https:///foo.html', # Not technically external, but we say so anyway
100+
'//example.com'
101+
]
102+
for link in external:
103+
self.assertTrue(is_external_link(link),
104+
msg="Should be external: {}".format(link))
105+
106+
relative = [
107+
'/example.com',
108+
'../foo.html',
109+
'foo.html',
110+
'foo'
111+
]
112+
for link in relative:
113+
self.assertFalse(is_external_link(link),
114+
msg="Should not be external: {}".format(link))
98115

99116
class TestPlaceholderLinks(unittest.TestCase):
100117
# Placeholder link replacement is implemented in the MediaWiki site JS at

0 commit comments

Comments
 (0)