forked from csev/py4e
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwikigrade.py
More file actions
76 lines (65 loc) · 2.34 KB
/
wikigrade.py
File metadata and controls
76 lines (65 loc) · 2.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import string
import sqlite3
import urllib
import xml.etree.ElementTree as ET
from BeautifulSoup import *
# Open (or create) the local page cache; conn/cur are shared module-wide
# by tinyTable() below via `global`.
conn = sqlite3.connect('wikidata.db')
cur = conn.cursor()
# One row per fetched URL: the raw page text plus the time it was retrieved.
cur.execute('''
CREATE TABLE IF NOT EXISTS TinyTable (id INTEGER PRIMARY KEY,
url TEXT, page BLOB, retrieved_at timestamp)''')
# A slightly extended dictionary
class sash(dict):
    """A dict that can report its items ranked by value.

    Used below as a counter: keys are names, values are counts.
    """

    def sortvalues(self, reverse=True):
        """Return (key, value) pairs ordered by value then key.

        Largest values come first by default; pass reverse=False for
        ascending order.
        """
        pairs = list(self.items())
        pairs.sort(key=lambda kv: (kv[1], kv[0]), reverse=reverse)
        return pairs
def tinyTable(url):
global cur,conn
cur.execute('SELECT id,page,retrieved_at FROM TinyTable WHERE URL = ?', (url, ))
try:
row = cur.fetchone()
print 'DATE',row[2]
return row[1]
except:
row = None
print 'Retrieving', url
data = urllib.urlopen (url).read()
if row != None:
cur.execute("UPDATE TinyTable SET page=?,retrieved_at=datetime('now') WHERE id=?", (unicode(data, 'utf-8'), row[0]))
else:
cur.execute("INSERT INTO TinyTable (url, page, retrieved_at) VALUES (?, ?, datetime('now'))",(url, unicode(data, 'utf-8')))
conn.commit()
return data
# Seed page of the CTools wiki; crawling never leaves the tool prefix below.
cururl = 'https://ctools.umich.edu/portal/tool/27500dea-c105-4f7b-a195-3c89536a64b7?pageName=%2Fsite%2Ff57681b8-6db9-46cf-aad1-3a0bdd621138%2Fhome&action=view&panel=Main&realm=%2Fsite%2Ff57681b8-6db9-46cf-aad1-3a0bdd621138'
prefix = 'https://ctools.umich.edu/portal/tool/27500dea-c105-4f7b-a195-3c89536a64b7'
# Work queue of URLs still to fetch and the list of URLs already seen.
urls = list()
urls.append(cururl)
visited = list()
# Per-user counters; NOTE(review): nothing below ever increments these,
# so the final report prints empty — presumably grading code was removed.
editcounts = sash()
postcounts = sash()
while len(urls) > 0 :
print '=== URLS Yet To Retrieve:',len(urls)
cururl = urls.pop()
if cururl in visited: continue
print 'RETRIEVING',cururl
data = tinyTable(cururl)
visited.append(cururl)
soup = BeautifulSoup(data)
tags = soup('a')
# print 'Tags'
for tag in tags:
print tag
url = tag.get('href',None)
if url == None : continue
# Don't follow absolute urls
if not url.startswith(prefix) : continue
newurl = urllib.basejoin(cururl,url)
if newurl in visited : continue
# print 'APPENDING',newurl
if newurl.find('action=view') > 0 or newurl.find('action=history') > 0 :
urls.append(newurl)
print 'EDITS:'
for (key,val) in editcounts.sortvalues():
print key, val
for (key,val) in sorted(postcounts.items()):
print key, val
conn.close()