Skip to content
This repository was archived by the owner on Sep 18, 2025. It is now read-only.

Commit b80434e

Browse files
authored
Timeout bug with section alignment (#80)
* Disassociates sections from headings to deal with bug that messes up section alignment dictionary on StructuredDiffer when timeout is triggered and sections were changed. Simplifies code/logic nicely too. Now the tree differ detects section changes so some tests required updating too. Getting slightly different behavior on two section moving tests too though it's within the realm of acceptable.
1 parent 4ee519b commit b80434e

File tree

5 files changed

+63
-63
lines changed

5 files changed

+63
-63
lines changed

mwedittypes/node_differ.py

Lines changed: 24 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -272,8 +272,8 @@ def get_diff_count(result, lang='en'):
272272
"""
273273
node_edits = []
274274
text_edits = []
275+
section_edits = []
275276
context = []
276-
section_titles = set()
277277
prev_text = []
278278
curr_text = []
279279

@@ -292,12 +292,13 @@ def get_diff_count(result, lang='en'):
292292
continue
293293
else:
294294
tf_removes.add(text)
295-
section_titles.add(r['section'])
296295
# if node is text, just check whether there's anything and retain for later
297296
# because all the text is processed at once at the end
298297
if et == 'Text' and text:
299298
prev_text.append(text)
300299
# non-text node: verify/fine-tune the edit type and add to results dictionary
300+
elif et == 'Section':
301+
section_edits.append({'edittype':'remove', 'section':r['section']})
301302
else:
302303
name, changes = get_node_diff(node_type=et, prev_wikitext=text, curr_wikitext='', lang=lang)
303304
node_edits.append(NodeEdit(et, 'remove', r['section'], name, changes))
@@ -311,9 +312,10 @@ def get_diff_count(result, lang='en'):
311312
continue
312313
else:
313314
tf_inserts.add(text)
314-
section_titles.add(i['section'])
315315
if et == 'Text' and text:
316316
curr_text.append(text)
317+
elif et == 'Section':
318+
section_edits.append({'edittype': 'insert', 'section': i['section']})
317319
else:
318320
name, changes = get_node_diff(node_type=et, prev_wikitext='', curr_wikitext=text, lang=lang)
319321
node_edits.append(NodeEdit(et, 'insert', i['section'], name, changes))
@@ -328,11 +330,11 @@ def get_diff_count(result, lang='en'):
328330
continue
329331
else:
330332
tf_changes.add(ptext)
331-
section_titles.add(c['curr']['section'])
332-
section_titles.add(c['prev']['section'])
333333
if et == 'Text' and ptext != ctext:
334334
prev_text.append(ptext)
335335
curr_text.append(ctext)
336+
elif et == 'Section':
337+
section_edits.append({'edittype': 'change', 'section': c['prev']['section']})
336338
else:
337339
name, changes = get_node_diff(node_type=et, prev_wikitext=ptext, curr_wikitext=ctext, lang=lang)
338340
node_edits.append(NodeEdit(et, 'change', c['prev']['section'], name, changes))
@@ -347,10 +349,11 @@ def get_diff_count(result, lang='en'):
347349
continue
348350
else:
349351
tf_moves.add(ptext)
350-
section_titles.add(m['curr']['section'])
351-
section_titles.add(m['prev']['section'])
352-
name, changes = get_node_diff(node_type=et, prev_wikitext=ptext, curr_wikitext=ctext, lang=lang)
353-
node_edits.append(NodeEdit(et, 'move', m['prev']['section'], name, changes))
352+
if et == 'Section':
353+
section_edits.append({'edittype': 'move', 'section': m['prev']['section']})
354+
else:
355+
name, changes = get_node_diff(node_type=et, prev_wikitext=ptext, curr_wikitext=ctext, lang=lang)
356+
node_edits.append(NodeEdit(et, 'move', m['prev']['section'], name, changes))
354357

355358
# give raw insert/remove counts
356359
# changes can be assumed to be overlap between insert+remove -- e.g., 5 insert and 3 remove -> 3 change, 2 insert
@@ -365,16 +368,17 @@ def get_diff_count(result, lang='en'):
365368
elif et_count < 0:
366369
text_edits.append(TextEdit(text_subcat, 'remove', txt, abs(et_count)))
367370

368-
# identify how many sections have been edited based on a few heuristics
369-
if section_titles:
370-
sec_remove = sum([1 for n in node_edits if n.type == 'Heading' and n.edittype == 'remove']) + int('curr-no-content' in result)
371-
sec_insert = sum([1 for n in node_edits if n.type == 'Heading' and n.edittype == 'insert']) + int('prev-no-content' in result)
372-
sec_change = len(section_titles) - sec_remove - sec_insert
373-
if sec_remove:
374-
context.append(Context('Section', 'remove', sec_remove))
375-
if sec_insert:
376-
context.append(Context('Section', 'insert', sec_insert))
377-
if sec_change:
378-
context.append(Context('Section', 'change', sec_change))
371+
sec_remove = sum([1 for n in section_edits if n['edittype'] == 'remove'])
372+
sec_insert = sum([1 for n in section_edits if n['edittype'] == 'insert'])
373+
sec_move = sum([1 for n in section_edits if n['edittype'] == 'move'])
374+
sec_change = sum([1 for n in section_edits if n['edittype'] == 'change'])
375+
if sec_remove:
376+
context.append(Context('Section', 'remove', sec_remove))
377+
if sec_insert:
378+
context.append(Context('Section', 'insert', sec_insert))
379+
if sec_change:
380+
context.append(Context('Section', 'change', sec_change))
381+
if sec_move:
382+
context.append(Context('Section', 'move', sec_move))
379383

380384
return {'node-edits': node_edits, 'text-edits': text_edits, 'context': context}

mwedittypes/tree_differ.py

Lines changed: 11 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,6 @@ def get_diff(prev_wikitext, curr_wikitext, lang='en', timeout=False):
1515
d = Differ(prev_tree, curr_tree, timeout=timeout)
1616
diff = d.get_corresponding_nodes()
1717
result = diff.post_process(prev_tree.secname_to_text, curr_tree.secname_to_text, lang=lang)
18-
# this helps the node differ know that the lede is also a new section
19-
# otherwise depends on headings to track changes to sections
20-
if not prev_wikitext:
21-
result['prev-no-content'] = True
22-
if not curr_wikitext:
23-
result['curr-no-content'] = True
2418
return result
2519

2620

@@ -137,7 +131,8 @@ def __init__(self, wikitext, lang="en"):
137131
self.lang = lang
138132
self.root = OrderedNode('root', ntype="Article")
139133
self.secname_to_text = {}
140-
self.wikitext_to_tree(wikitext)
134+
if wikitext:
135+
self.wikitext_to_tree(wikitext)
141136
self.key_roots = None
142137

143138
def wikitext_to_tree(self, wikitext):
@@ -303,12 +298,6 @@ def get_node_distance(self, n1, n2):
303298
return self.nodetype_chg_cost
304299
elif n1.content_hash == n2.content_hash:
305300
return 0
306-
# if both sections, assert no cost (changes will be captured by headings)
307-
# except we want to detect moves of sections
308-
elif n1.ntype == 'Section':
309-
if not n1.children and not n2.children:
310-
return self.chg_cost
311-
return 0
312301
# otherwise, same node types and not the same, then change cost
313302
else:
314303
return self.chg_cost
@@ -399,11 +388,7 @@ def find_minimum_transactions(self, kr1, kr2, transactions):
399388
transactions[i][j] = cc
400389

401390
def get_corresponding_nodes(self):
402-
"""Explain transactions.
403-
404-
Skip 'Section' operations as they aren't real nodes and
405-
any changes to them will be captured via Headings etc.
406-
"""
391+
"""Explain transactions."""
407392
if self.transactions:
408393
transactions = self.transactions[len(self.t1) - 1][len(self.t2) - 1]
409394
remove = []
@@ -412,18 +397,14 @@ def get_corresponding_nodes(self):
412397
for i in range(0, len(transactions)):
413398
if transactions[i][0] is None:
414399
ins_node = self.t2[transactions[i][1]]
415-
# this second clause allows us to later detect moved sections
416-
if ins_node.ntype != 'Section' or not ins_node.children:
417-
insert.append(ins_node)
400+
insert.append(ins_node)
418401
elif transactions[i][1] is None:
419402
rem_node = self.t1[transactions[i][0]]
420-
if rem_node.ntype != 'Section' or not rem_node.children:
421-
remove.append(rem_node)
403+
remove.append(rem_node)
422404
else:
423405
prev_node = self.t1[transactions[i][0]]
424406
curr_node = self.t2[transactions[i][1]]
425-
if prev_node.ntype != 'Section' or not prev_node.children:
426-
change.append((prev_node, curr_node))
407+
change.append((prev_node, curr_node))
427408
diff = {'remove': remove, 'insert': insert, 'change': change}
428409
self.detect_moves(diff)
429410
return Diff(nodes_removed=diff['remove'], nodes_inserted=diff['insert'],
@@ -565,17 +546,18 @@ def _section_mapping(self, sections_prev, sections_curr):
565546
removed = []
566547
# removed sections map to null in current; inserted sections map to null in previous
567548
for n in self.remove:
568-
if n['type'] == 'Heading':
549+
if n['type'] == 'Section':
569550
for i, s in enumerate(prev):
570551
if s == n['section']:
571552
removed.append(i)
572553
break
573554
for idx in sorted(removed, reverse=True):
574555
p_to_c[prev[idx]] = None
575556
prev.pop(idx)
557+
576558
inserted = []
577559
for n in self.insert:
578-
if n['type'] == 'Heading':
560+
if n['type'] == 'Section':
579561
for i, s in enumerate(curr):
580562
if s == n['section']:
581563
inserted.append(i)
@@ -584,12 +566,13 @@ def _section_mapping(self, sections_prev, sections_curr):
584566
c_to_p[curr[idx]] = None
585567
curr.pop(idx)
586568

569+
587570
# changes happen in place so don't effect structure of doc and can be ignored
588571
# for moved sections, reorder mapping so they are aligned again for dumping
589572
for c in self.move:
590573
pn = c['prev']
591574
cn = c['curr']
592-
if pn['type'] == 'Heading':
575+
if pn['type'] == 'Section':
593576
prev_idx = None
594577
curr_idx = None
595578
for i, s in enumerate(prev):

mwedittypes/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ def extract_text(mwnode, lang='en'):
9191
return mwnode.contents.strip_code(collapse=False)
9292
elif ntype == 'Text Formatting':
9393
return ''.join(extract_text(mwn) for mwn in mwnode.contents.nodes)
94-
# Heading, Template, Comment, Argument, Category, Media, References, URLs without display text
94+
# Article, Section, Heading, Template, Comment, Argument, Category, Media, References, URLs without display text
9595
# Tags not listed here (div, gallery, etc.) that almost never have true text content and can be super messy
9696
# Table elements (they duplicate the text if included)
9797
else:

tests/test_edittypes_summary.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -69,12 +69,12 @@ def test_text_within_formatting():
6969
full_diff = StructuredEditTypes(prev_wikitext, curr_wikitext, lang='en').get_diff()
7070
assert full_diff_to_simple(full_diff) == expected_changes
7171

72-
@pytest.mark.xfail(reason="Tree Differ can't currently detect simple shifts in text formatting start-stops.")
72+
@pytest.mark.xfail(reason="Differs can't currently detect simple shifts in text formatting start-stops.")
7373
def test_change_formatting_length():
7474
curr_wikitext = prev_wikitext.replace("'''Karl Josef Aigen''' (8 October 1684 – 22 October 1762) was a landscape painter, born at",
7575
"'''Karl Josef Aigen (8 October 1684 – 22 October 1762) was a landscape painter, born at'''",
7676
1)
77-
expected_changes = {'Section': {'change': 1}}
77+
expected_changes = {'Section': {'change': 1}, 'Text Formatting': {'change': 1}}
7878
diff = SimpleEditTypes(prev_wikitext, curr_wikitext, lang='en').get_diff()
7979
assert diff == expected_changes
8080
full_diff = StructuredEditTypes(prev_wikitext, curr_wikitext, lang='en')
@@ -299,11 +299,15 @@ def test_complicated_sections():
299299
curr_wikitext = curr_wikitext.replace("[[Category:Artists from Olomouc]]",
300300
"[[Category:Artists in Olomouc]]",
301301
1)
302-
expected_changes = {'Heading': {'insert': 1}, 'Category': {'change': 1}, 'Section': {'insert': 1, 'change': 2}}
302+
simple_expected_changes = {'Heading': {'insert': 1}, 'Category': {'change': 1},
303+
'Section': {'insert': 1, 'change': 2}}
303304
diff = SimpleEditTypes(prev_wikitext, curr_wikitext, lang='en').get_diff()
304-
assert diff == expected_changes
305+
assert diff == simple_expected_changes
306+
# NOTE: simple expected changes are a bit more realistic but this is acceptable
307+
structured_expected_changes = {'Heading': {'insert': 1}, 'Category': {'change': 1},
308+
'Section': {'insert': 2, 'change': 1, 'remove': 1}}
305309
full_diff = StructuredEditTypes(prev_wikitext, curr_wikitext, lang='en').get_diff()
306-
assert full_diff_to_simple(full_diff) == expected_changes
310+
assert full_diff_to_simple(full_diff) == structured_expected_changes
307311

308312

309313
def test_moved_section():
@@ -313,7 +317,7 @@ def test_moved_section():
313317
"\n\n==References==\n{{reflist}}\n\n===Works===\nThe [[Österreichische Galerie Belvedere|Gallery of the Belvedere]] in Vienna has two works by him, both scenes with figures.<ref>{{Bryan (3rd edition)|title=Aigen, Karl |volume=1}}</ref>",
314318
1)
315319
simple_expected_changes = {}
316-
full_expected_changes = {'Section':{'move':1, 'change':2}}
320+
full_expected_changes = {'Section':{'move':1}}
317321
diff = SimpleEditTypes(prev_wikitext, curr_wikitext, lang='en').get_diff()
318322
assert diff == simple_expected_changes
319323
full_diff = StructuredEditTypes(prev_wikitext, curr_wikitext, lang='en').get_diff()

tests/test_tree_differ.py

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@ def test_insert_category():
3131
curr_wikitext = prev_wikitext.replace('[[Category:Artists from Olomouc]]\n',
3232
'[[Category:Artists from Olomouc]]\n[[Category:TEST CATEGORY]]',
3333
1)
34-
expected_changes = [('insert', '4: ==External links==', 'Category')]
34+
expected_changes = [('insert', '4: ==External links==', 'Category'),
35+
('change', '4: ==External links==', 'Section')]
3536
diff = StructuredEditTypes(prev_wikitext, curr_wikitext, lang='en')
3637
diff.get_diff()
3738
check_change_counts(diff.tree_diff, expected_changes)
@@ -41,7 +42,8 @@ def test_insert_link():
4142
curr_wikitext = prev_wikitext.replace('He was a pupil of the Olomouc painter',
4243
'He was a [[pupil]] of the Olomouc painter',
4344
1)
44-
expected_changes = [('insert', '1: ==Life==', 'Wikilink')]
45+
expected_changes = [('insert', '1: ==Life==', 'Wikilink'),
46+
('change', '1: ==Life==', 'Section')]
4547
diff = StructuredEditTypes(prev_wikitext, curr_wikitext, lang='en')
4648
diff.get_diff()
4749
check_change_counts(diff.tree_diff, expected_changes)
@@ -52,7 +54,9 @@ def test_move_template():
5254
'',
5355
1)
5456
curr_wikitext = '{{Austria-painter-stub}}' + curr_wikitext
55-
expected_changes = [('move', '4: ==External links==', 'Template')]
57+
expected_changes = [('move', '4: ==External links==', 'Template'),
58+
('change', '4: ==External links==', 'Section'),
59+
('change', '0: Lede', 'Section')]
5660
diff = StructuredEditTypes(prev_wikitext, curr_wikitext, lang='en')
5761
diff.get_diff()
5862
check_change_counts(diff.tree_diff, expected_changes)
@@ -62,7 +66,8 @@ def test_change_heading():
6266
curr_wikitext = prev_wikitext.replace('===Works===',
6367
'===NotWorks===',
6468
1)
65-
expected_changes = [('change', '2: ===Works===', 'Heading')]
69+
expected_changes = [('change', '2: ===Works===', 'Heading'),
70+
('change', '2: ===Works===', 'Section')]
6671
diff = StructuredEditTypes(prev_wikitext, curr_wikitext, lang='en')
6772
diff.get_diff()
6873
check_change_counts(diff.tree_diff, expected_changes)
@@ -73,7 +78,8 @@ def test_remove_formatting():
7378
"Karl Josef Aigen",
7479
1)
7580
# two tf-spans removed; won't be reduced to a single block until the node differ
76-
expected_changes = [('remove', '0: Lede', 'Text Formatting'), ('remove', '0: Lede', 'Text Formatting')]
81+
expected_changes = [('remove', '0: Lede', 'Text Formatting'), ('remove', '0: Lede', 'Text Formatting'),
82+
('change', '0: Lede', 'Section')]
7783
diff = StructuredEditTypes(prev_wikitext, curr_wikitext, lang='en')
7884
diff.get_diff()
7985
check_change_counts(diff.tree_diff, expected_changes)
@@ -114,7 +120,8 @@ def test_insert_table():
114120
('insert', '4: ==External links==', 'Text Formatting'),
115121
('insert', '4: ==External links==', 'Text Formatting'),
116122
('insert', '4: ==External links==', 'Text Formatting'),
117-
('insert', '4: ==External links==', 'Text Formatting')]
123+
('insert', '4: ==External links==', 'Text Formatting'),
124+
('change', '4: ==External links==', 'Section')]
118125
diff = StructuredEditTypes(prev_wikitext, curr_wikitext, lang='en')
119126
diff.get_diff()
120127
check_change_counts(diff.tree_diff, expected_changes)
@@ -133,7 +140,8 @@ def test_insert_gallery():
133140
('insert', '4: ==External links==', 'Media'),
134141
('insert', '4: ==External links==', 'Wikilink'),
135142
('insert', '4: ==External links==', 'Text Formatting'),
136-
('insert', '4: ==External links==', 'Text Formatting')]
143+
('insert', '4: ==External links==', 'Text Formatting'),
144+
('change', '4: ==External links==', 'Section')]
137145
diff = StructuredEditTypes(prev_wikitext, curr_wikitext, lang='en')
138146
diff.get_diff()
139147
check_change_counts(diff.tree_diff, expected_changes)
@@ -142,7 +150,8 @@ def test_insert_gallery():
142150
def test_table_change():
143151
curr_wikitext = table.replace('general election', 'gen elec', 1)
144152
expected_changes = [('change', '0: Lede', 'Wikilink'),
145-
('change', '0: Lede', 'Table')]
153+
('change', '0: Lede', 'Table'),
154+
('change', '0: Lede', 'Section')]
146155
diff = StructuredEditTypes(table, curr_wikitext, lang='en')
147156
diff.get_diff()
148157
check_change_counts(diff.tree_diff, expected_changes)

0 commit comments

Comments
 (0)