forked from adamlaska/datatracker
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbackfill_data.py
More file actions
executable file
·184 lines (144 loc) · 6.52 KB
/
backfill_data.py
File metadata and controls
executable file
·184 lines (144 loc) · 6.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#!/usr/bin/env python
# Copyright The IETF Trust 2017-2020, All Rights Reserved
# -*- coding: utf-8 -*-
import io
import sys
import os
import os.path
import argparse
import time
from typing import Set, Optional # pyflakes:ignore
basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
sys.path = [ basedir ] + sys.path
os.environ["DJANGO_SETTINGS_MODULE"] = "ietf.settings"
virtualenv_activation = os.path.join(basedir, "env", "bin", "activate_this.py")
if os.path.exists(virtualenv_activation):
exec(compile(io.open(virtualenv_activation, "rb").read(), virtualenv_activation, 'exec'), dict(__file__=virtualenv_activation))
import django
django.setup()
from django.conf import settings
import debug # pyflakes:ignore
from ietf.doc.models import Document
from ietf.name.models import FormalLanguageName
from ietf.utils.draft import PlaintextDraft
parser = argparse.ArgumentParser()
parser.add_argument("--document", help="specific document name")
parser.add_argument("--words", action="store_true", help="fill in word count")
parser.add_argument("--formlang", action="store_true", help="fill in formal languages")
parser.add_argument("--authors", action="store_true", help="fill in author info")
args = parser.parse_args()
formal_language_dict = { l.pk: l for l in FormalLanguageName.objects.all() }
docs_qs = Document.objects.filter(type="draft")
if args.document:
docs_qs = docs_qs.filter(docalias__name=args.document)
ts = time.strftime("%Y-%m-%d_%H:%M%z")
logfile = io.open('backfill-authorstats-%s.log'%ts, 'w')
print("Writing log to %s" % os.path.abspath(logfile.name))
def say(msg):
msg = msg.encode('utf8')
sys.stderr.write(msg)
sys.stderr.write('\n')
logfile.write(msg)
logfile.write('\n')
def unicode(text):
if text is None:
return text
# order matters here:
for encoding in ['ascii', 'utf8', 'latin1', ]:
try:
utext = text.decode(encoding)
# if encoding == 'latin1':
# say("Warning: falling back to latin1 decoding for %s ..." % utext[:216]])
return utext
except UnicodeDecodeError:
pass
start = time.time()
say("Running query for documents to process ...")
for doc in docs_qs.prefetch_related("docalias", "formal_languages", "documentauthor_set", "documentauthor_set__person", "documentauthor_set__person__alias_set"):
canonical_name = doc.name
for n in doc.docalias.all():
if n.name.startswith("rfc"):
canonical_name = n.name
if canonical_name.startswith("rfc"):
path = os.path.join(settings.RFC_PATH, canonical_name + ".txt")
else:
path = os.path.join(settings.INTERNET_ALL_DRAFTS_ARCHIVE_DIR, canonical_name + "-" + doc.rev + ".txt")
if not os.path.exists(path):
say("Skipping %s, no txt file found at %s" % (doc.name, path))
continue
with io.open(path, 'rb') as f:
say("\nProcessing %s" % doc.name)
sys.stdout.flush()
d = PlaintextDraft(unicode(f.read()), path)
updated = False
updates = {}
if args.words:
words = d.get_wordcount()
if words != doc.words:
updates["words"] = words
if args.formlang:
langs = d.get_formal_languages()
new_formal_languages = set(formal_language_dict[l] for l in langs)
old_formal_languages = set(doc.formal_languages.all())
if new_formal_languages != old_formal_languages:
for l in new_formal_languages - old_formal_languages:
doc.formal_languages.add(l)
updated = True
for l in old_formal_languages - new_formal_languages:
doc.formal_languages.remove(l)
updated = True
if args.authors:
old_authors = doc.documentauthor_set.all()
old_authors_by_name = {}
old_authors_by_email = {}
for author in old_authors:
for alias in author.person.alias_set.all():
old_authors_by_name[alias.name] = author
old_authors_by_name[author.person.plain_name()] = author
if author.email_id:
old_authors_by_email[author.email_id] = author
# the draft parser sometimes has a problem when
# affiliation isn't in the second line and it then thinks
# it's an extra author - skip those extra authors
seen = set() # type: Set[Optional[str]]
for full, _, _, _, _, email, country, company in d.get_author_list():
assert full is None or isinstance(full, str)
assert email is None or isinstance(email, str)
assert country is None or isinstance(country, str)
assert isinstance(company, str)
#full, email, country, company = [ unicode(s) for s in [full, email, country, company, ] ]
if email in seen:
continue
seen.add(email)
old_author = None
if email:
old_author = old_authors_by_email.get(email)
if not old_author:
old_author = old_authors_by_name.get(full)
if not old_author:
say("UNKNOWN AUTHOR: %s, %s, %s, %s, %s" % (doc.name, full, email, country, company))
continue
if old_author.affiliation != company:
say("new affiliation: %s [ %s <%s> ] %s -> %s" % (canonical_name, full, email, old_author.affiliation, company))
old_author.affiliation = company
old_author.save(update_fields=["affiliation"])
updated = True
if country is None:
country = ""
if old_author.country != country:
say("new country: %s [ %s <%s> ] %s -> %s" % (canonical_name , full, email, old_author.country, country))
old_author.country = country
old_author.save(update_fields=["country"])
updated = True
if updates:
Document.objects.filter(pk=doc.pk).update(**updates)
updated = True
if updated:
say("updated: %s" % canonical_name)
stop = time.time()
dur = stop-start
sec = dur%60
min = dur//60
say("Processing time %d:%02d" % (min, sec))
print("\n\nWrote log to %s" % os.path.abspath(logfile.name))
logfile.close()