Skip to content

Commit 565b10e

Browse files
committed
Improve parser for references in Internet-Drafts. Fixes ietf-tools#2360
- Legacy-Id: 14851
1 parent 49f00b7 commit 565b10e

File tree

1 file changed

+89
-65
lines changed

1 file changed

+89
-65
lines changed

ietf/utils/draft.py

Lines changed: 89 additions & 65 deletions
Original file line number | Diff line number | Diff line change
@@ -1019,79 +1019,103 @@ def get_title(self):
10191019

10201020
# ------------------------------------------------------------------
10211021
def get_refs(self):
1022-
refType = 'unk'
1023-
refs = {}
1024-
typemap = {
1025-
'normative': 'norm',
1026-
'informative': 'info',
1027-
'informational': 'info',
1028-
'non-normative': 'info',
1029-
None: 'old'
1030-
}
1031-
# Bill's horrible "references section" regexps, built up over lots of years
1032-
# of fine tuning for different formats.
1033-
# Examples:
1034-
# Appendix A. References:
1035-
# A.1. Informative References:
1036-
sectionre = re.compile( r'(?i)(?:Appendix\s+)?(?:(?:[A-Z]\.)?[0-9.]*\s+)?(?:(\S+)\s*)?references:?$' )
1037-
# 9.1 Normative
1038-
sectionre2 = re.compile( r'(?i)(?:(?:[A-Z]\.)?[0-9.]*\s+)?(\S+ormative)$' )
1039-
# One other reference section type seen:
1040-
sectionre3 = re.compile( r'(?i)References \((\S+ormative)\)$' )
1041-
# An Internet-Draft reference.
1042-
idref = re.compile( r'(?i)\b(draft-(?:[-\w]+(?=-\d\d)|[-\w]+))(-\d\d)?\b' )
1043-
# An RFC-and-other-series reference.
1044-
rfcref = re.compile( r'(?i)\b(rfc|std|bcp|fyi)[- ]?(\d+)\b' )
1022+
# Bill's horrible "references section" regexps, built up over lots of years
1023+
# of fine tuning for different formats.
1024+
# Examples:
1025+
# Appendix A. References:
1026+
# A.1. Informative References:
1027+
sectionre = re.compile( r'(?i)(?:Appendix\s+)?(?:(?:[A-Z]\.)?[0-9.]*\s+)?(?:(\S+)\s*)?references:?$' )
1028+
# 9.1 Normative
1029+
sectionre2 = re.compile( r'(?i)(?:(?:[A-Z]\.)?[0-9.]*\s+)?(\S+ormative)$' )
1030+
# One other reference section type seen:
1031+
sectionre3 = re.compile( r'(?i)References \((\S+ormative)\)$' )
1032+
# An Internet-Draft reference.
1033+
idref = re.compile( r'(?i)\b(draft-(?:[-\w]+(?=-\d\d)|[-\w]+))(-\d\d)?\b' )
1034+
# An RFC-and-other-series reference.
1035+
rfcref = re.compile( r'(?i)\b(rfc|std|bcp|fyi)[- ]?(\d+)\b' )
10451036
# False positives for std
10461037
not_our_std_ref = re.compile( r'(?i)((\b(n?csc|fed|mil|is-j)-std\b)|(\bieee\s*std\d*\b)|(\bstd\s+802\b))' )
1047-
# An Internet-Draft or series reference hyphenated by a well-meaning line break.
1048-
eol = re.compile( r'(?i)\b(draft[-\w]*-|rfc|std|bcp|fyi)$' )
1038+
# An Internet-Draft or series reference hyphenated by a well-meaning line break.
1039+
eol = re.compile( r'(?i)\b(draft[-\w]*-|rfc|std|bcp|fyi)$' )
10491040
# std at the front of a line can hide things like IEEE STD or MIL-STD
10501041
std_start = re.compile( r'(?i)std\n*\b' )
10511042

1052-
for i in range( 15, len( self.lines ) ):
1053-
line = self.lines[ i ].strip()
1054-
m = sectionre.match( line )
1055-
if m:
1056-
match = m.group( 1 )
1057-
if match is not None:
1058-
match = match.lower()
1059-
refType = typemap.get( match, 'unk' )
1060-
continue
1061-
m = sectionre2.match( line )
1062-
if m:
1063-
refType = typemap.get( m.group( 1 ).lower(), 'unk' )
1064-
continue
1065-
m = sectionre3.match( line )
1066-
if m:
1067-
refType = typemap.get( m.group( 1 ).lower(), 'unk' )
1068-
continue
1069-
# If something got split badly, rejoin it.
1070-
if eol.search( line ) and i < len( self.lines ) - 1:
1071-
line += self.lines[ i + 1 ].lstrip()
1072-
m = idref.search( line )
1073-
if m:
1074-
draft = m.group( 1 )
1075-
refs[ draft ] = refType
1076-
continue
1077-
m = rfcref.search( line )
1078-
if m:
1079-
( series, number ) = m.groups()
1080-
if series.lower()=='std' and std_start.search(line) and i > 15:
1081-
line = self.lines[i-1].rstrip()+line
1082-
if series.lower()!='std' or not not_our_std_ref.search( line ):
1083-
name = series.lower() + number.lstrip( '0' )
1084-
refs[ name ] = refType
1085-
continue
1086-
# References to BCP78 and BCP79 in boilerplate will appear as "unk".
1087-
# Remove them.
1088-
for boilerplate in ( 'bcp78', 'bcp79' ):
1089-
if refs.get( boilerplate ) == 'unk':
1090-
del refs[ boilerplate ]
1043+
refs = {}
1044+
in_ref_sect = False
1045+
in_norm_ref_sect = False
1046+
refType = 'unk'
1047+
1048+
for i in range( 15, len( self.lines ) ):
1049+
line = self.lines[ i ].strip()
1050+
1051+
# skip over lines until we find the start of the reference section
1052+
if not in_ref_sect:
1053+
m = sectionre.match( line )
1054+
if not m:
1055+
m = sectionre2.match( line )
1056+
if not m:
1057+
m = sectionre3.match( line )
1058+
1059+
if m:
1060+
in_ref_sect = True
1061+
refType = 'info'
1062+
if line.lower().find("normative") > 1:
1063+
in_norm_ref_sect = True
1064+
refType = 'norm'
1065+
1066+
# might be subsections within a references section
1067+
if in_ref_sect and not in_norm_ref_sect:
1068+
m = sectionre.match( line )
1069+
if not m:
1070+
m = sectionre2.match( line )
1071+
if not m:
1072+
m = sectionre3.match( line )
1073+
1074+
if m:
1075+
in_ref_sect = True
1076+
if line.lower().find("normative") > 1:
1077+
in_norm_ref_sect = True
1078+
refType = 'norm'
1079+
1080+
# look for the end of the normative reference section
1081+
if in_norm_ref_sect:
1082+
m = sectionre.match( line )
1083+
if not m:
1084+
m = sectionre2.match( line )
1085+
if not m:
1086+
m = sectionre3.match( line )
1087+
1088+
if m and line.lower().find("normative") < 0:
1089+
in_norm_ref_sect = False
1090+
refType = 'info'
1091+
1092+
# find references within the section
1093+
if in_ref_sect:
1094+
# If something got split badly, rejoin it.
1095+
if eol.search( line ) and i < len( self.lines ) - 1:
1096+
line += self.lines[ i + 1 ].lstrip()
1097+
1098+
m = idref.search( line )
1099+
if m:
1100+
draft = m.group( 1 )
1101+
if draft not in refs:
1102+
refs[ draft ] = refType
1103+
1104+
m = rfcref.search( line )
1105+
if m:
1106+
( series, number ) = m.groups()
1107+
if series.lower()=='std' and std_start.search(line) and i > 15:
1108+
line = self.lines[i-1].rstrip()+line
1109+
if series.lower()!='std' or not not_our_std_ref.search( line ):
1110+
name = series.lower() + number.lstrip( '0' )
1111+
if name not in refs:
1112+
refs[ name ] = refType
1113+
10911114
# Don't add any references that point back into this doc
10921115
if self.filename in refs:
10931116
del refs[self.filename]
1094-
return refs
1117+
1118+
return refs
10951119

10961120
def old_get_refs( self ):
10971121
refs = []

0 commit comments

Comments
 (0)