Skip to content

Commit 8e71bcc

Browse files
authored
Merge pull request #31 from wikimedia/utf8
Convert DB rows into strings
2 parents 147ccf7 + 48857e6 commit 8e71bcc

File tree

1 file changed

+41
-45
lines changed

1 file changed

+41
-45
lines changed

reports.py

Lines changed: 41 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,20 @@
55
from displayTable import *
66
import re
77

8+
def decode_tuple(tuple):
9+
"""
10+
Decodes binary strings in a result row
11+
@param tuple: Database row
12+
@type tuple: tuple
13+
@rtype: tuple
14+
"""
15+
result = ()
16+
for var in tuple:
17+
if isinstance(var, bytes):
18+
var = var.decode('utf8')
19+
result += (var,)
20+
return result
21+
822
class Reports:
923
def __init__( self, site, db, wiki, dry_run=False ):
1024
self.db = db
@@ -14,11 +28,19 @@ def __init__( self, site, db, wiki, dry_run=False ):
1428
self.site.requests['timeout'] = 120
1529
self.dry_run = dry_run
1630

31+
def query( self, sql ):
32+
"""
33+
Executes an SQL query and returns a generator producing rows
34+
"""
35+
cur = self.db.cursor()
36+
cur.execute( sql )
37+
for row in cur.fetchall():
38+
yield decode_tuple(row)
39+
1740
# Oldest edited articles
1841
# Run time on enwiki 5 hours 23 minutes as of 8 Sept 2015
1942
def forgotten_articles( self ):
2043
# Make the query
21-
cur = self.db.cursor()
2244
query = """SELECT SQL_SMALL_RESULT
2345
MAX(rev_timestamp) AS lastedit, COUNT(rev_id) AS editcount, page_title
2446
FROM revision,
@@ -44,12 +66,11 @@ def forgotten_articles( self ):
4466
WHERE rev_page=page_id
4567
GROUP BY page_id
4668
ORDER BY lastedit ASC"""
47-
cur.execute( query )
4869

4970
# Extract the data into a Python nested list
5071
content = []
5172
content.append( ['forgotten-articles-title', 'forgotten-articles-last-edited', 'forgotten-articles-editcount'] )
52-
for row in cur.fetchall() :
73+
for row in self.query( query ) :
5374
# A page name is being caught by the testwiki abuse filter - the following lets this run:
5475
if re.search('abuse_filter',row[2],re.IGNORECASE):
5576
continue
@@ -63,14 +84,12 @@ def forgotten_articles( self ):
6384
# Page count by namespace
6485
# Run time on enwiki 4 hours 8 minutes as of 8 Sept 2015
6586
def page_count_by_namespace( self ):
66-
cur = self.db.cursor()
6787
query = """SELECT page_namespace, COUNT(*) AS total, SUM(page_is_redirect) AS redirect FROM page
6888
GROUP BY page_namespace"""
69-
cur.execute( query )
7089

7190
content = []
7291
content.append( ['pagecount-namespace', 'pagecount-namespace-name', 'pagecount-total', 'pagecount-redirect', 'pagecount-non-redirect'] )
73-
for row in cur.fetchall():
92+
for row in self.query( query ):
7493
content.append( [ row[0], '{{subst:ns:' + str( row[0] ) + '}}', row[1], row[2], row[1]-row[2] ])
7594

7695
# Format the data as wikitext
@@ -80,17 +99,15 @@ def page_count_by_namespace( self ):
8099

81100
# Pages with most revisions
82101
def pages_with_most_revisions( self ):
83-
cur = self.db.cursor()
84102
query = """SELECT COUNT(*) AS revisions, rev_page, p.page_namespace, p.page_title FROM revision r
85103
LEFT JOIN ( SELECT page_id, page_title, page_namespace FROM page ) p ON r.rev_page = p.page_id
86104
GROUP BY rev_page
87105
ORDER BY revisions DESC
88106
LIMIT 1000"""
89-
cur.execute( query )
90107

91108
content = []
92109
content.append( ['pagerevisions-namespace', 'pagerevisions-title', 'pagerevisions-revisions'] )
93-
for row in cur.fetchall():
110+
for row in self.query( query ):
94111
content.append( [ row[2], self.linkify( row[3], row[2] ), row[0] ])
95112
# Format the data as wikitext
96113
text = display_report( self.wiki, content , 'pagerevisions-desc' )
@@ -101,7 +118,6 @@ def pages_with_most_revisions( self ):
101118
# Identify users who meet the criteria for being granted "autopatrolled" on the English Wikipedia but who don't already have it.
102119
# Author: Andrew Crawford (thparkth) <acrawford@laetabilis.com>
103120
def autopatrol_eligibles( self ):
104-
cur = self.db.cursor()
105121
query = """ SELECT
106122
# "editor" consisting of username, wrapped in HTML tags linking to the sigma "created" tool
107123
CONCAT (
@@ -150,11 +166,10 @@ def autopatrol_eligibles( self ):
150166
HAVING created_count > 24
151167
ORDER BY created_count DESC
152168
LIMIT 500"""
153-
cur.execute( query )
154169

155170
content = []
156171
content.append( ['autopatrol-username', 'autopatrol-listlink', 'autopatrol-articles'] )
157-
for row in cur.fetchall():
172+
for row in self.query( query ):
158173
if row[1] is None:
159174
continue
160175
content.append( [ row[0], row[1], row[2] ] )
@@ -164,20 +179,18 @@ def autopatrol_eligibles( self ):
164179

165180

166181
def new_wikiprojects( self ):
167-
cur = self.db.cursor()
168182
query = """SELECT rc_timestamp, rc_title
169183
FROM recentchanges
170184
WHERE rc_new = 1
171185
AND rc_namespace = 4
172186
AND rc_title LIKE 'WikiProject%'
173187
AND rc_title NOT LIKE '%/%'
174188
ORDER BY rc_timestamp DESC
175-
""";
176-
cur.execute( query )
189+
"""
177190

178191
content = []
179192
content.append( [ 'newwp-date', 'newwp-title' ] )
180-
for row in cur.fetchall():
193+
for row in self.query( query ):
181194
if row[1] is None:
182195
continue
183196
content.append( [ row[0], self.linkify( row[1], 4 ) ] )
@@ -187,7 +200,6 @@ def new_wikiprojects( self ):
187200

188201

189202
def talk_pages_by_size( self ):
190-
cur = self.db.cursor()
191203
query = """SELECT page_namespace,
192204
REPLACE( SUBSTRING_INDEX(page_title, '/', 1 ), '_', ' ' ) AS parent,
193205
SUM( page_len ) / 1024 / 1024 AS total_size
@@ -196,11 +208,10 @@ def talk_pages_by_size( self ):
196208
GROUP BY page_namespace, parent
197209
ORDER BY total_size DESC
198210
LIMIT 300"""
199-
cur.execute( query )
200211

201212
content = []
202213
content.append( ['tpbs-namespace', 'tpbs-page', 'tpbs-size'] )
203-
for row in cur.fetchall():
214+
for row in self.query( query ):
204215
content.append( [ row[0], self.linkify( row[1], row[0] ), row[2] ] )
205216

206217
# Format the data as wikitext
@@ -209,7 +220,6 @@ def talk_pages_by_size( self ):
209220

210221

211222
def unused_file_redirects( self ):
212-
cur = self.db.cursor()
213223
query = """SELECT page_title,
214224
( SELECT COUNT(*)
215225
FROM imagelinks
@@ -225,11 +235,10 @@ def unused_file_redirects( self ):
225235
AND page_is_redirect = 1
226236
HAVING imagelinks + links <= 1
227237
"""
228-
cur.execute( query )
229238

230239
content = []
231240
content.append( ['ufr-page', 'ufr-imagelinks', 'ufr-links'] )
232-
for row in cur.fetchall():
241+
for row in self.query( query ):
233242
content.append( [ self.linkify( row[0], 6 ), row[1], row[2] ] )
234243

235244
# Format the data as wikitext
@@ -238,7 +247,6 @@ def unused_file_redirects( self ):
238247

239248

240249
def oldest_active( self ):
241-
cur = self.db.cursor()
242250
query = """SELECT SQL_SMALL_RESULT
243251
CONCAT( '[[User:',user_name,'|',user_name,']]' ) AS user_name,
244252
user_registration,
@@ -258,12 +266,11 @@ def oldest_active( self ):
258266
) AS InnerQuery
259267
ORDER BY user_registration
260268
LIMIT 200"""
261-
cur.execute( query )
262269

263270
content = []
264271
content.append( ['oldestactive-username', 'oldestactive-creationdate', 'oldestactive-editcount'] )
265-
for row in cur.fetchall():
266-
content.append( [ row[0], row[1] , row[2] ] );
272+
for row in self.query( query ):
273+
content.append( [ row[0], row[1] , row[2] ] )
267274

268275
# Format the data as wikitext
269276
text = display_report( self.wiki, content, 'oldestactive-desc' )
@@ -291,7 +298,6 @@ def deleted_prods( self ):
291298
AND log_namespace=0
292299
GROUP BY page_id
293300
LIMIT 500"""
294-
cur.execute( query )
295301
content = []
296302
content.append(
297303
['deletedprods-title',
@@ -301,8 +307,8 @@ def deleted_prods( self ):
301307
'deletedprods-delcomments'
302308
]
303309
)
304-
for row in cur.fetchall():
305-
logtext = row[4];
310+
for row in self.query( query ):
311+
logtext = row[4]
306312
logtext = re.sub("{","<nowiki>{</nowiki>",logtext)
307313
logtext = re.sub("}","<nowiki>}</nowiki>",logtext)
308314
content.append( [ self.linkify( row[0] ), row[1], datetime.datetime.strptime( row[2],'%Y%m%d%H%M%S'), datetime.datetime.strptime( row[3],'%Y%m%d%H%M%S'), logtext ] )
@@ -312,25 +318,22 @@ def deleted_prods( self ):
312318
self.publish_report( 'deletedprods-page-title', text )
313319

314320
def most_used_templates( self ):
315-
cur = self.db.cursor()
316321
query = """SELECT tl_title, COUNT(*)
317322
FROM templatelinks
318323
WHERE tl_namespace = 10
319324
GROUP BY tl_title
320325
ORDER BY COUNT(*) DESC
321326
LIMIT 3000"""
322-
cur.execute( query )
323327
content = []
324328
content.append( ['mostusedtemplate-title', 'mostusedtemplate-count'] )
325-
for row in cur.fetchall():
329+
for row in self.query( query ):
326330
content.append( [ self.linkify( row[0], 10 ), row[1] ] )
327331

328332
# Format the data as wikitext
329333
text = display_report( self.wiki, content, 'mostusedtemplate-desc' )
330334
self.publish_report( 'mostusedtemplate-page-title', text )
331335

332336
def unused_templates( self ):
333-
cur = self.db.cursor()
334337
query = """SELECT page_title
335338
FROM page
336339
LEFT JOIN categorylinks
@@ -346,18 +349,16 @@ def unused_templates( self ):
346349
AND tl_from IS NULL
347350
AND cl_from IS NULL
348351
LIMIT 2000"""
349-
cur.execute( query )
350352
content = []
351353
content.append( ['unusedtemplate-title'] )
352-
for row in cur.fetchall():
354+
for row in self.query( query ):
353355
content.append( [ self.linkify( row[0], 10 ) ] )
354356

355357
# Format the data as wikitext
356358
text = display_report( self.wiki, content, 'unusedtemplate-desc' )
357359
self.publish_report( 'unusedtemplate-page-title', text )
358360

359361
def orphaned_talk( self ):
360-
cur = self.db.cursor()
361362
query = """SELECT page_namespace, page_id, page_title, page_is_redirect, page_len
362363
FROM page talkpage
363364
WHERE talkpage.page_title NOT LIKE '%/%'
@@ -375,16 +376,15 @@ def orphaned_talk( self ):
375376
AND tl_title='G8-exempt'
376377
)
377378
LIMIT 1000"""
378-
cur.execute( query )
379379
content = []
380380
content.append( ["orphantalk-count", "orphantalk-isredirect", "orphantalk-namespace", "orphantalk-itemtitle", "orphantalk-pagesize"] )
381381
namespace_translate={ 1:"Talk", 5:"Wikipedia_Talk", 11:"Template_Talk", 15:"Category_Talk", 101:"Portal_Talk", 119:"Draft_Talk" }
382382
count = 1
383383

384-
for row in cur.fetchall():
384+
for row in self.query( query ):
385385
namespace = namespace_translate[row[0]]
386386
fulllink = namespace+':' + row[2]
387-
pagesize = round( row[4]/1024, 1 );
387+
pagesize = round( row[4]/1024, 1 )
388388
if row[3] == 0:
389389
redirect_label = " "
390390
else:
@@ -398,18 +398,16 @@ def orphaned_talk( self ):
398398

399399
def most_edited_page_last_month( self ):
400400
# Make the query
401-
cur = self.db.cursor()
402401
query = """SELECT rc_title, count(*) as num_edits
403402
FROM recentchanges
404403
WHERE rc_namespace = 0 AND rc_type < 2
405404
GROUP BY 1 ORDER BY 2 DESC
406405
LIMIT 25;"""
407-
cur.execute( query )
408406

409407
# Extract the data into a Python nested list
410408
content = []
411409
content.append( ['most_edited_page_last_month-title', 'most_edited_page_last_month-editcount'] )
412-
for row in cur.fetchall() :
410+
for row in self.query( query ):
413411
content.append( [ self.linkify( row[0] ), row[1] ] )
414412

415413
# Format the data as wikitext
@@ -418,7 +416,6 @@ def most_edited_page_last_month( self ):
418416

419417
# Longest articles (NS=0)
420418
def article_by_size( self ):
421-
cur = self.db.cursor()
422419
query = """SELECT
423420
page_namespace,
424421
page_title,
@@ -429,10 +426,9 @@ def article_by_size( self ):
429426
AND page_title NOT LIKE "%/%"
430427
ORDER BY page_len DESC
431428
LIMIT 1000;"""
432-
cur.execute( query )
433429
content = []
434430
content.append( ['article_by_size-namespace', 'article_by_size-title', 'article_by_size-size'] )
435-
for row in cur.fetchall():
431+
for row in self.query( query ):
436432
content.append( [ row[0], self.linkify( row[1], row[0] ), row[2] ])
437433

438434
# Format the data as wikitext

0 commit comments

Comments
 (0)