Skip to content

Commit 7742654

Browse files
author
p12
committed
Transform/DDG: add 'that is,' as a synonym of 'i.e.'
1 parent 68b4afc commit 7742654

File tree

1 file changed

+25
-13
lines changed

1 file changed

+25
-13
lines changed

ddg_parse_html.py

Lines changed: 25 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -149,8 +149,9 @@ def process_description(el):
149149
# description must never contain newlines
150150
desc = desc.replace('\n',' ')
151151

152-
# Handle 'i.e.' as a special case
152+
# Handle 'i.e.' and 'that is,' as special cases
153153
desc = desc.replace('i.e.', 'ᚃ')
154+
desc = desc.replace('that is,', 'ᚄ')
154155

155156
# process the description:
156157
# remove text in parentheses (except when it's within a tags
@@ -177,7 +178,9 @@ def process_description(el):
177178
if open_paren_count == 0:
178179
end = t.start()+1
179180
text = desc[last_paren_open:end]
180-
if text.find('ᚃ') != -1 or len(text) > min_paren_size:
181+
if (text.find('ᚃ') != -1 or
182+
text.find('ᚄ') != -1 or
183+
len(text) > min_paren_size):
181184
del_ranges.append((last_paren_open, t.start()+1))
182185

183186
else:
@@ -246,22 +249,30 @@ def process_description(el):
246249
# limit, then try to cut desc at "i.e." if present. Otherwise, cut desc
247250
# in the middle of the sentence, preferably at the end of a word
248251
if first_dot == -1 or first_dot > len(desc):
249-
iepos = desc.rfind('ᚃ')
250-
if iepos != -1 and iepos > 2:
252+
253+
# find the last 'i.e.' / 'that is,' placeholder in desc
254+
m = None
255+
for m in re.finditer('[ᚃᚄ]', desc):
256+
pass
257+
if m and m.start() > 2:
258+
pos = m.start()
259+
char = m.group(0)
260+
251261
# string is too long but we can cut it at 'i.e.'
252-
if desc[iepos-2:iepos+1] == ', ᚃ':
253-
desc = desc[:iepos-2] + '.'
254-
elif desc[iepos-2:iepos+1] == ' ,ᚃ':
255-
desc = desc[:iepos-2] + '.'
256-
elif desc[iepos-1:iepos+1] == ',ᚃ':
257-
desc = desc[:iepos-1] + '.'
258-
elif desc[iepos-1:iepos+1] == ' ᚃ':
259-
desc = desc[:iepos-1] + '.'
262+
if desc[pos-2:pos+1] == ', '+char:
263+
desc = desc[:pos-2] + '.'
264+
elif desc[pos-2:pos+1] == ' ,'+char:
265+
desc = desc[:pos-2] + '.'
266+
elif desc[pos-1:pos+1] == ','+char:
267+
desc = desc[:pos-1] + '.'
268+
elif desc[pos-1:pos+1] == ' '+char:
269+
desc = desc[:pos-1] + '.'
260270
else:
261-
desc = desc[:iepos]
271+
desc = desc[:pos]
262272
else:
263273
# open_count != 0 means that we are not within a word already
264274
if open_count == 0:
275+
m = None
265276
for m in re.finditer('[\s.]+', desc):
266277
pass
267278
if m:
@@ -271,6 +282,7 @@ def process_description(el):
271282
else:
272283
desc = desc[:first_dot] + '.'
273284
desc = desc.replace('ᚃ', 'i.e.')
285+
desc = desc.replace('ᚄ', 'that is,')
274286
return desc
275287

276288
''' Returns a short description of a feature. This is the first sentence after

0 commit comments

Comments
 (0)