21 changes: 19 additions & 2 deletions README.md
@@ -77,23 +77,40 @@ To start a new crawl, specify a crawl name, seed URLs, and the API via which URL
token = "SOME_TOKEN"
name = "sampleCrawlName"
seeds = "http://www.twitter.com/"
apiUrl = "analyze"
sampleCrawl = DiffbotCrawl(token,name,seeds,apiUrl)
api = "analyze"
sampleCrawl = DiffbotCrawl(token,name,seeds=seeds,api=api)
```

Omit "seeds" and "api" to load an existing crawl, or create a crawl as a placeholder.

To check the status of a crawl:

```
sampleCrawl.status()
```

To update a crawl:

```
maxToCrawl = 100
upp = "diffbot"
sampleCrawl.update(maxToCrawl=maxToCrawl,urlProcessPattern=upp)
```
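
Any other Crawlbot parameter can be passed to update() in the same way; keyword arguments are forwarded directly to the API request.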

To delete or restart a crawl:

```
sampleCrawl.delete()
sampleCrawl.restart()
```

To download crawl data:

```
sampleCrawl.download() # returns JSON by default
sampleCrawl.download(data_format="csv")
```
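
The CSV variant returns the raw response body, so it can be written straight to a file (a minimal sketch; the filename is arbitrary):

```
csv_data = sampleCrawl.download(data_format="csv")
with open("sampleCrawlName_data.csv", "wb") as f:
    f.write(csv_data)
```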

To pass additional arguments to a crawl:

```
33 changes: 28 additions & 5 deletions client.py
@@ -32,7 +32,7 @@ def format_version_string(version_number):

class DiffbotJob(DiffbotClient):
"""
Various calls for managing a Diffbot Crawlbot or Bulk API job.
Various calls for managing a Crawlbot or Bulk API job.
"""

def request(self,params):
@@ -51,6 +51,12 @@ def status(self):
        response = self.request(self.params)
        return response

    def update(self,**kwargs):
        """
        updates a crawl or bulk job with new parameters
        """
        temp_params = dict(self.params)
        temp_params.update(kwargs)
        response = self.request(temp_params)
        return response

    def delete(self):
        temp_params = self.params
        temp_params['delete'] = 1
@@ -63,19 +69,36 @@ def restart(self):
        response = self.request(temp_params)
        return response

def download(self,data_format="json"):
"""
downloads the JSON output of a crawl or bulk job
"""

download_url = '{}/v3/{}/download/{}-{}_data.{}'.format(
self.base_url,self.jobType,self.params['token'],self.params['name'],data_format
)
download = requests.get(download_url)
download.raise_for_status()
if data_format == "csv":
return download.content
else:
return download.json()

class DiffbotCrawl(DiffbotJob):
"""
Initializes a new Diffbot crawl. Pass additional arguments as necessary.
Initializes a Diffbot crawl. Pass additional arguments as necessary.
"""

def __init__(self,token,name,seeds,api,apiVersion=3,**kwargs):
def __init__(self,token,name,seeds=None,api=None,apiVersion=3,**kwargs):
        self.params = {
            "token": token,
            "name": name,
        }
        startParams = dict(self.params)
        startParams['seeds'] = seeds
        startParams['apiUrl'] = self.compose_url(api,apiVersion)
        if seeds:
            startParams['seeds'] = seeds
        if api:
            startParams['apiUrl'] = self.compose_url(api,apiVersion)
        startParams.update(kwargs)
        self.jobType = "crawl"
        self.start(startParams)
2 changes: 1 addition & 1 deletion example.py
@@ -73,7 +73,7 @@
seeds = "http://support.diffbot.com"
api = "article"
name = "testCrawl"
diffbot = DiffbotCrawl(token,name,seeds,api)
diffbot = DiffbotCrawl(token, name, seeds=seeds, api=api)
time.sleep(5)
status = diffbot.status()
print "\nPrinting status:\n"