Skip to content

Commit 9987c17

Browse files
committed
initial checkin
0 parents  commit 9987c17

File tree

7 files changed

+184
-0
lines changed

7 files changed

+184
-0
lines changed

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Python client library for Google Refine
2+
3+
This library lets you script Refine: create projects from data files, apply extracted JSON operation histories to the data, and then export the transformed data back out of Refine.
4+
5+
Work in progress! More docs coming soon.

dates.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Date
2+
7 December 2001
3+
July 1 2002
4+
10/20/10

operations.json

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
[
2+
{
3+
"op": "core/text-transform",
4+
"description": "Text transform on cells in column Date using expression grel:value.toDate()",
5+
"engineConfig": {
6+
"facets": [],
7+
"mode": "row-based"
8+
},
9+
"columnName": "Date",
10+
"expression": "grel:value.toDate()",
11+
"onError": "set-to-blank",
12+
"repeat": false,
13+
"repeatCount": 10
14+
},
15+
{
16+
"op": "core/text-transform",
17+
"description": "Text transform on cells in column Date using expression grel:value.datePart(\"year\")+1",
18+
"engineConfig": {
19+
"facets": [],
20+
"mode": "row-based"
21+
},
22+
"columnName": "Date",
23+
"expression": "grel:value.datePart(\"year\")+1",
24+
"onError": "set-to-blank",
25+
"repeat": false,
26+
"repeatCount": 10
27+
}
28+
]

refine.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
# requires installation of urllib2_file from https://github.com/seisen/urllib2_file/#readme
2+
3+
import urllib2_file
4+
import urllib2, urlparse, os.path, time, json
5+
6+
class Refine:
    """Client for a Google Refine server.

    Holds the server's base URL and creates projects on it by uploading
    data files.
    """

    def __init__(self, server='http://127.0.0.1:3333'):
        """Remember the base URL of the Refine server.

        server -- base URL of the server; trailing slashes are removed so
                  that command paths can be appended with a single '/'.
        """
        # The original used server[0,-1], which indexes a string with a
        # tuple and raises TypeError for any URL ending in '/'.
        self.server = server.rstrip('/')

    def new_project(self, file_path, options=None):
        """Create a project on the server from the file at file_path.

        file_path -- path of the data file to upload.
        options   -- optional dict; when it contains 'project_name' that
                     value names the project, otherwise the file's base
                     name is used.

        Returns a RefineProject when the URL the server responds from
        carries a 'project' query parameter, otherwise None.
        """
        file_name = os.path.split(file_path)[-1]
        project_name = (options or {}).get('project_name', file_name)

        # Keep the file open only for the duration of the upload, and read
        # it as bytes so the uploaded content matches the file on disk.
        with open(file_path, 'rb') as fd:
            data = {
                'project-file': {
                    'fd': fd,
                    'filename': file_name,
                },
                'project-name': project_name,
            }
            response = urllib2.urlopen(
                self.server + '/command/core/create-project-from-upload',
                data)
            response.read()

        # The project id is read from the query string of the final URL.
        url_params = urlparse.parse_qs(
            urlparse.urlparse(response.geturl()).query)
        if 'project' in url_params:
            project_id = url_params['project'][0]
            return RefineProject(self.server, project_id, project_name)

        # TODO: better error reporting
        return None
30+
31+
class RefineProject:
    """Handle to one project on a Refine server.

    Wraps the server commands used here: polling running processes,
    applying an operation history, exporting rows and deleting the project.
    """

    def __init__(self, server, id, project_name):
        """Store the server base URL, the project id and the project name."""
        self.server = server
        # The id is concatenated into request URLs below, so keep it as a
        # string even when the caller passes a number.
        self.id = str(id)
        self.project_name = project_name

    def wait_until_idle(self, polling_delay=0.5):
        """Block until the server reports no processes for this project.

        polling_delay -- seconds to sleep between two polls.
        """
        url = self.server + '/command/core/get-processes?project=' + self.id
        while True:
            response = urllib2.urlopen(url)
            response_json = json.loads(response.read())
            # A missing or empty 'processes' entry means nothing is running.
            if not response_json.get('processes'):
                return
            time.sleep(polling_delay)

    def apply_operations(self, file_path, wait=True):
        """Apply the JSON operation history stored in file_path.

        file_path -- path of a file containing the operations as JSON text.
        wait      -- when True and the server answers 'pending', poll until
                     the project is idle and report 'ok'.

        Returns the response code: 'ok', or 'pending' when wait is False.
        Raises Exception with the server's message when the code is 'error'.
        """
        # Read the operations and close the file before talking to the server.
        with open(file_path) as fd:
            operations_json = fd.read()

        data = {
            'operations': operations_json,
        }
        response = urllib2.urlopen(
            self.server + '/command/core/apply-operations?project=' + self.id,
            data)
        response_json = json.loads(response.read())

        code = response_json['code']
        if code == 'error':
            raise Exception(response_json['message'])
        if code == 'pending' and wait:
            self.wait_until_idle()
            return 'ok'

        return code  # can be 'ok' or 'pending'

    def export_rows(self, format='tsv'):
        """Export all rows of the project and return the response body.

        format -- export format passed to the server, e.g. 'tsv'.
        """
        data = {
            'engine': '{"facets":[],"mode":"row-based"}',
            'project': self.id,
            'format': format,
        }
        # NOTE(review): project_name goes into the URL path unquoted; names
        # with spaces or other reserved characters may need URL quoting.
        url = (self.server + '/command/core/export-rows/'
               + self.project_name + '.' + format)
        response = urllib2.urlopen(url, data)
        return response.read()

    def delete_project(self):
        """Delete the project; return True when the server answers 'ok'."""
        data = {
            'project': self.id,
        }
        response = urllib2.urlopen(
            self.server + '/command/core/delete-project', data)
        response_json = json.loads(response.read())
        return response_json.get('code') == 'ok'

refine.pyc

3.59 KB
Binary file not shown.

refineapi.md

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
# Refine API
2+
3+
When uploading files you will need to send the data as `multipart/form-data`, e.g.:
4+
5+
Content-Disposition: form-data; name="project-file"; filename="operations.json"
6+
7+
Content-Disposition: form-data; name="project-name"
8+
9+
myproject
10+
11+
The other operations are just normal POST parameters
12+
13+
## Create project:
14+
15+
POST /command/core/create-project-from-upload
16+
17+
multipart form-data:
18+
19+
'project-file' : file contents...
20+
'project-name' : project name...
21+
22+
Returns new project ID and other metadata
23+
24+
## Apply operations
25+
26+
POST /command/core/apply-operations
27+
28+
multipart form-data:
29+
30+
'project' : project id...
31+
'operations' : file contents...
32+
33+
Returns JSON response
34+
35+
## Export rows
36+
37+
POST /command/core/export-rows
38+
39+
'engine' : JSON string... (e.g. '{"facets":[],"mode":"row-based"}')
40+
'project' : project id...
41+
'format' : format... (e.g. 'tsv', 'csv')
42+
43+
Returns exported row data
44+
45+
## Delete project
46+
47+
POST /command/core/delete-project
48+
49+
'project' : project id...
50+
51+
Returns JSON response
52+
53+
## Check status of async processes
54+
55+
POST /command/core/get-processes
56+
57+
'project' : project id...
58+
59+
Returns JSON response

test.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Smoke test: create a project from dates.txt, apply operations.json,
# print the exported rows and delete the project again.
# Needs a Refine server listening on the default address.
import os
import sys

# sys.path entries must be directories; the original appended the file name
# "refine.py", which has no effect on module lookup.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
import refine

r = refine.Refine()
p = r.new_project("dates.txt")
if p is None:
    # new_project returns None when no project id came back from the server.
    sys.exit("Could not create a Refine project from dates.txt")

try:
    p.apply_operations("operations.json")
    # Parenthesised single-argument print works as a statement in Python 2
    # and as a function call in Python 3.
    print(p.export_rows())
finally:
    # Remove the project from the server even if a step above failed.
    p.delete_project()

0 commit comments

Comments
 (0)