Skip to content

Commit 8974476

Browse files
author
p12
committed
Add script to download several images forgotten by httrack
1 parent 22c8ae7 commit 8974476

File tree

2 files changed

+73
-0
lines changed

2 files changed

+73
-0
lines changed

Makefile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,3 +150,6 @@ source:
150150
rm -f "reference/hts-log.txt"
151151
rm -f "reference/index.html"
152152

153+
#download files that httrack has forgotten
154+
./httrack-workarounds.py
155+

httrack-workarounds.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
#!/usr/bin/env python
2+
3+
# Copyright (C) 2012 p12 <tir5c3@yahoo.co.uk>
4+
#
5+
# This file is part of cppreference-doc
6+
#
7+
# This program is free software: you can redistribute it and/or modify
8+
# it under the terms of the GNU General Public License as published by
9+
# the Free Software Foundation, either version 3 of the License, or
10+
# (at your option) any later version.
11+
#
12+
# This program is distributed in the hope that it will be useful,
13+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
14+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15+
# GNU General Public License for more details.
16+
#
17+
# You should have received a copy of the GNU General Public License
18+
# along with this program. If not, see http://www.gnu.org/licenses/.
19+
20+
# This program downloads all files that have been forgotten by httrack
21+
# Currently these files are images referenced from the CSS files
22+
23+
import fnmatch
24+
import re
25+
import os
26+
27+
# Directory that holds the httrack mirror; one sub-directory per host.
MIRROR_ROOT = 'reference'

# The three spellings of a CSS url() value. They are applied to text from
# which ALL whitespace has been removed, so spaces within paths are NOT
# supported. Only values that directly follow a ':' are considered.
URL_PATTERNS = (
    re.compile(r''':url\(([^'"][^(]*[^'"])\)'''),  # url(bare)
    re.compile(r":url\('([^']*)'\)"),              # url('single-quoted')
    re.compile(r':url\("([^"]*)"\)'),              # url("double-quoted")
)

# Host part must be exactly <lang>.cppreference.com, i.e. followed by '/' or
# the end of the string, so "en.cppreference.com.evil.org" is rejected.
HOST_PATTERN = re.compile(r'[^./]+\.cppreference\.com(?:/|$)')


def warn(message):
    """Write a one-line warning to stderr."""
    import sys  # local import: only needed on the failure path
    sys.stderr.write('httrack-workarounds: ' + message + '\n')


def find_css_files(top):
    """Return the paths of all '*.css' files found below directory *top*."""
    css_files = []
    for root, dirnames, filenames in os.walk(top):
        for filename in fnmatch.filter(filenames, '*.css'):
            css_files.append(os.path.join(root, filename))
    return css_files


def find_css_urls(text):
    """Return the distinct url() values found in CSS *text*.

    The result keeps first-seen order: bare values first, then
    single-quoted, then double-quoted ones. Each value is returned exactly
    as written in the CSS (query string included) so that it can later be
    located in, and replaced within, the original text.
    """
    compact = re.sub(r'\s+', '', text)
    urls = []
    for pattern in URL_PATTERNS:
        for url in pattern.findall(compact):
            # the same image is commonly referenced by several rules;
            # report (and later download) it only once
            if url not in urls:
                urls.append(url)
    return urls


def local_path_for_url(url):
    """Map *url* to its path inside the mirror, relative to MIRROR_ROOT.

    Returns None when *url* is not an absolute http(s) URL pointing at
    <lang>.cppreference.com. The query string is stripped, e.g.
    'http://en.cppreference.com/a.png?301' -> 'en.cppreference.com/a.png'.
    """
    # strip query string
    path = re.sub(r'\?[^?]*$', '', url)

    scheme = re.match(r'https?://', path)
    if scheme is None:
        return None
    path = path[scheme.end():]

    if HOST_PATTERN.match(path) is None:
        return None
    return path


def download(url, dest):
    """Fetch *url* into file *dest* with wget; return True on success."""
    import subprocess  # local import: not part of the file's import block

    parent = os.path.dirname(dest)
    if not os.path.isdir(parent):
        os.makedirs(parent)

    # equivalent of the former 'rm -f': drop a stale copy if there is one
    if os.path.isfile(dest) or os.path.islink(dest):
        os.remove(dest)

    try:
        # argument list, no shell: the URL comes from downloaded CSS and
        # must never be interpreted by a shell
        status = subprocess.call(['wget', url, '-O', dest])
    except OSError as err:
        warn('could not run wget: ' + str(err))
        return False
    return status == 0


def localize_css(fn):
    """Download the images referenced by CSS file *fn* and relink them.

    Every url() value that points at <lang>.cppreference.com is downloaded
    into the mirror and the reference in *fn* is replaced by a path relative
    to the directory of *fn*. References whose download fails are left
    untouched. The file is rewritten only when something changed.
    """
    with open(fn, "r") as f:
        original = f.read()

    updated = original
    start = os.path.abspath(os.path.dirname(fn))
    mirror = os.path.abspath(MIRROR_ROOT)

    for url in find_css_urls(original):
        rel = local_path_for_url(url)
        if rel is None:
            continue

        dest = os.path.abspath(os.path.join(MIRROR_ROOT, rel))
        # a path containing '..' must not escape the mirror directory
        if not dest.startswith(mirror + os.sep):
            warn('skipping ' + url + ': destination is outside ' + MIRROR_ROOT)
            continue

        if not download(url, dest):
            warn('download of ' + url + ' failed; reference left unchanged')
            continue

        updated = updated.replace(url, os.path.relpath(dest, start))

    if updated != original:
        with open(fn, "w") as f:
            f.write(updated)


def main():
    """Process every CSS file of the mirror."""
    for fn in find_css_files(MIRROR_ROOT):
        localize_css(fn)


if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)