Created
February 22, 2015 20:18
-
-
Save alexandrutodor/164308c23a6d2bfa61a1 to your computer and use it in GitHub Desktop.
Shell script to download the latest Wikipedia dumps for a specific language
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Downloads and unpacks the latest Wikipedia dump files for one language.
#
# Usage: bash downloadDumps.sh "wikicode"      (for example: en, de, ro)
# Files will be placed in $HOME/workspace/dump/"wikicode"wiki
#
# Run it with bash, not plain sh: the script relies on bash arrays.
# Exit status: 0 when every file was downloaded and extracted,
#              1 when a download, an extraction or the folder setup failed,
#              2 when no wikicode was given.

# Print an error message on stderr and abort the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Decompress one downloaded archive in place, picking the tool from the
# file extension. -f lets bzip2/gzip overwrite the extracted file left
# behind by a previous run instead of refusing to continue.
extract() {
  local archive=$1
  case "$archive" in
    *.bz2)
      echo "Extracting file $archive , this will take several minutes"
      bzip2 -d -f -- "$archive"
      ;;
    *.gz)
      echo "Extracting file $archive"
      gzip -d -f -- "$archive"
      ;;
    *)
      printf 'Do not know how to extract %s\n' "$archive" >&2
      return 1
      ;;
  esac
}

# Without a wikicode the url below would point at a project simply
# called "wiki", so stop right away.
if [[ -z "${1:-}" ]]; then
  printf 'Usage: %s wikicode\n' "${0##*/}" >&2
  exit 2
fi

# initializing parameters
wiki="${1}wiki"

# urlRoot is set to the url containing the latest wikipedia dumps
# of the specified language wiki
urlRoot="https://dumps.wikimedia.org/$wiki/latest/"

# names of the files we want to download
names=(
  "$wiki-latest-pages-articles.xml.bz2"
  "$wiki-latest-imagelinks.sql.gz"
  "$wiki-latest-image.sql.gz"
  "$wiki-latest-langlinks.sql.gz"
  "$wiki-latest-templatelinks.sql.gz"
)

# folder path where the downloaded files will be placed
path="$HOME/workspace/dump/$wiki"

# making sure the folder exists; mkdir -p succeeds when it is already there
mkdir -p -- "$path" || die "Cannot create folder $path"

# The steps below delete and create files in the current directory, so
# never carry on if we could not enter the target folder.
cd -- "$path" || die "Cannot enter folder $path"

# if the files already exist in the folder they are deleted
# and downloaded again
for name in "${names[@]}"; do
  rm -f -- "$name"
done

# downloading the specified files in the $path folder;
# one failed download does not stop the remaining ones
failed=0
for name in "${names[@]}"; do
  echo "Downloading $urlRoot$name:"
  if ! wget "$urlRoot$name"; then
    printf 'Download failed: %s\n' "$urlRoot$name" >&2
    failed=1
  fi
done

# extracting the files in the $path folder
for name in "${names[@]}"; do
  # nothing to extract when the download did not produce the file
  [[ -f "$name" ]] || continue
  extract "$name" || failed=1
done

exit "$failed"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
2 issues in this code:
Line 33 -> if ! [ -f "$path" ] # space after !
Line 35 -> mkdir -p $path # $ is missing
Otherwise - thanks!