Skip to content

Commit 2d77873

Browse files
author
p12
committed
Transform/DDG: several fixes in the handling of the description
* remove text in parentheses * properly ignore the size of the markup tags in the calculation of the limit * make sure we don't cut the description at a markup tag * make sure we don't cut the description in the middle of a word
1 parent 4fcc61e commit 2d77873

File tree

1 file changed

+73
-17
lines changed

1 file changed

+73
-17
lines changed

ddg_parse_html.py

Lines changed: 73 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -148,46 +148,96 @@ def process_description(el):
148148
# Handle 'i.e.' as a special case
149149
desc = desc.replace('i.e.', 'ᚃ')
150150

151+
# process the description:
152+
# remove text in parentheses (except when it's within tags)
153+
# get the position of the cut of the description
154+
155+
open_count = 0
156+
open_paren_count = 0
157+
158+
del_ranges = []
159+
160+
# remove parentheses
161+
for t in re.finditer('(<code>|</code>|<i>|</i>|<b>|</b>|\(|\))', desc):
162+
mt = t.group(1)
163+
164+
if mt == '(':
165+
if open_count == 0:
166+
open_paren_count += 1
167+
if open_paren_count == 1:
168+
last_paren_open = t.start()
169+
170+
elif mt == ')':
171+
if open_count == 0 and open_paren_count > 0:
172+
open_paren_count -= 1
173+
if open_paren_count == 0:
174+
del_ranges.append((last_paren_open, t.start()+1))
175+
176+
else:
177+
if mt[1] != '/':
178+
open_count += 1
179+
else:
180+
open_count -= 1
181+
182+
for r in reversed(del_ranges):
183+
begin,end = r
184+
desc = desc[:begin] + desc[end:]
185+
151186
# limit the number of characters
152187
num_code = desc.count('<code>')
153188
num_i = desc.count('<i>')
154189
num_b = desc.count('<b>')
155190
limit = char_limit + num_code * 13 + num_i * 7 + num_b * 7
156191
desc = desc[:limit]
157192

158-
# find the last dot, remove broken tags
193+
# find the first dot, actual limit when ignoring the tags
159194
last_open = -1
160-
last_close = -1
195+
last_close = 0
161196
open_count = 0
162197
first_dot = -1
163198

164-
for t in re.finditer('<(/?(?:code|i|b))>', desc):
165-
if t.group(1)[0] != '/':
199+
curr_limit= char_limit
200+
201+
for t in re.finditer('(<code>|</code>|<i>|</i>|<b>|</b>)', desc):
202+
mt = t.group(1)
203+
204+
if t.start() > curr_limit + len(mt):
205+
break
166206

207+
curr_limit += len(mt)
208+
209+
if t.group(1)[1] != '/':
167210
if open_count == 0:
168211
last_open = t.start()
169212
# find any dots in the top level text
170-
if last_close != -1:
171-
pos = desc[last_close:last_open].rfind('.')
172-
if pos != -1 and first_dot == -1:
173-
first_dot = last_close + pos
213+
pos = desc[last_close:last_open].find('.')
214+
if pos != -1 and first_dot == -1:
215+
first_dot = last_close + pos
216+
174217
open_count += 1
175218

176219
else:
177220
open_count -= 1
178221
if open_count == 0:
179222
last_close = t.start()
180223

181-
if open_count > 0:
224+
# find dot if there were no tags (last_close == 0) or in the range after
225+
# the last close tag
226+
if first_dot == -1:
227+
pos = desc[last_close:].find('.')
228+
if pos != -1:
229+
first_dot = last_close + pos
230+
231+
# limit desc to the adjusted limit
232+
# additionally strip unclosed tags (last_open < curr_limit)
233+
if open_count == 0:
234+
desc = desc[:curr_limit]
235+
else:
182236
desc = desc[:last_open]
183237

184-
if last_close == -1:
185-
last_close = 0
186-
187-
pos = desc[last_close:].rfind('.')
188-
if pos != -1 and first_dot == -1:
189-
first_dot = last_close + pos
190-
238+
# limit desc to the first sentence. If first sentence is longer than the
239+
# limit, then try to cut desc at "i.e." if present. Otherwise, cut desc
240+
# in the middle of the sentence, preferably at the end of a word
191241
if first_dot == -1 or first_dot > len(desc):
192242
iepos = desc.rfind('ᚃ')
193243
if iepos != -1 and iepos > 2:
@@ -203,13 +253,19 @@ def process_description(el):
203253
else:
204254
desc = desc[:iepos]
205255
else:
256+
# open_count != 0 means that we are not within a word already
257+
if open_count == 0:
258+
for m in re.finditer('[\s.]+', desc):
259+
pass
260+
if m:
261+
desc = desc[:m.start()]
262+
206263
desc = desc + '...'
207264
else:
208265
desc = desc[:first_dot] + '.'
209266
desc = desc.replace('ᚃ', 'i.e.')
210267
return desc
211268

212-
213269
''' Returns a short description of a feature. This is the first sentence after
214270
the declaration (dcl template). If a list follows immediately, then the
215271
description is picked from a list item identified by num

0 commit comments

Comments
 (0)