Skip to content
This repository was archived by the owner on Nov 16, 2019. It is now read-only.

Commit 33401bd

Browse files
author
Mridul Jain
committed
Distributed LSTM
1 parent 081407c commit 33401bd

File tree

19 files changed

+2226
-20
lines changed

19 files changed

+2226
-20
lines changed

Makefile

Lines changed: 40 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,40 +1,63 @@
1+
# ---------------------------------------------------------------------------
# Build configuration.
#
# Every location below is assigned with `?=` so a value supplied through the
# environment or the make command line wins over the default.
# ---------------------------------------------------------------------------

# Do not pass PROJECT on to the commands run by the recipes.
unexport PROJECT
HOME ?= /home/${USER}

# Prefer the spark-submit found on PATH; otherwise fall back to a Spark 1.6.0
# distribution unpacked in the home directory.
ifeq ($(shell which spark-submit),)
SPARK_HOME ?= ${HOME}/spark-1.6.0-bin-hadoop2.6
else
SPARK_HOME ?= $(shell which spark-submit 2>&1 | sed 's/\/bin\/spark-submit//g')
endif

# Root of the CaffeOnSpark checkout: defaults to the directory make runs in.
CAFFE_ON_SPARK ?= $(CURDIR)

# Native library search paths. The *2 variants append the Caffe libraries
# produced under caffe-public and caffe-distri plus the system lib dirs.
LD_LIBRARY_PATH ?= /usr/local/cuda/
LD_LIBRARY_PATH2=${LD_LIBRARY_PATH}:${CAFFE_ON_SPARK}/caffe-public/distribute/lib:${CAFFE_ON_SPARK}/caffe-distri/distribute/lib:/usr/lib64:/lib64
DYLD_LIBRARY_PATH ?= /usr/local/cuda/lib
DYLD_LIBRARY_PATH2=${DYLD_LIBRARY_PATH}:${CAFFE_ON_SPARK}/caffe-public/distribute/lib:${CAFFE_ON_SPARK}/caffe-distri/distribute/lib:/usr/lib64:/lib64

# Major Spark version, extracted from the `spark-submit --version` banner.
# Simply expanded (:=) so spark-submit is launched once, not on every use.
export SPARK_VERSION := $(shell ${SPARK_HOME}/bin/spark-submit --version 2>&1 | grep version | awk '{print $$5}' | cut -d'.' -f1)
ifeq ($(strip ${SPARK_VERSION}),2)
export MVN_SPARK_FLAG=-Dspark2
endif
1718

18-
build:
19+
20+
# Aggregate target: runs the `platforms` build/test target and then the
# `package-release` packaging target. It names no file, hence .PHONY.
.PHONY: screwdriver
screwdriver: platforms package-release
21+
22+
# Clean-room build: reset the git submodules, build Caffe, package the Maven
# modules, extract the Linux LMDB JNI library from the fat jar, run the JVM
# tests, zip the Python API and finally run the Python test script.
# Steps inside one recipe line are joined with `&&` so a failing step aborts
# the build instead of being masked by the command that follows it.
.PHONY: platforms
platforms:
	git submodule init
	git submodule update --force
	git submodule foreach --recursive git clean -dfx
	cd caffe-public && $(MAKE) proto && $(MAKE) -j4 -e distribute
	export LD_LIBRARY_PATH="${LD_LIBRARY_PATH2}" && mvn ${MVN_SPARK_FLAG} -B package -DskipTests
	jar -xvf caffe-grid/target/caffe-grid-0.1-SNAPSHOT-jar-with-dependencies.jar META-INF/native/linux64/liblmdbjni.so
	mv META-INF/native/linux64/liblmdbjni.so ${CAFFE_ON_SPARK}/caffe-distri/distribute/lib
	export LD_LIBRARY_PATH="${LD_LIBRARY_PATH2}" && mvn ${MVN_SPARK_FLAG} -B test
	cd ${CAFFE_ON_SPARK}/caffe-grid/src/main/python/ && zip -r caffeonsparkpythonapi * && mv caffeonsparkpythonapi.zip ${CAFFE_ON_SPARK}/caffe-grid/target/
# NOTE(review): this step exports LD_LIBRARY_PATH, not LD_LIBRARY_PATH2 as the
# mvn steps do -- confirm PythonTest.sh does not need the Caffe library dirs.
	export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}; export SPARK_HOME=${SPARK_HOME};${CAFFE_ON_SPARK}/caffe-grid/src/test/python/PythonTest.sh
33+
34+
# Create the release package inside pkg/ with yinst_create and copy the
# resulting tarball(s) to $(AUTO_PUBLISH_DIR).
# `cd pkg && ...` replaces pushd/popd: those are bash builtins and are not
# available in the POSIX /bin/sh that make uses by default. Each recipe line
# runs in its own shell, so no directory restore is needed.
.PHONY: package-release
package-release:
	cd pkg && yinst_create --clean -r
	cp pkg/*.tgz $(AUTO_PUBLISH_DIR)
37+
38+
# Placeholder target: it has no prerequisites and no recipe, so invoking it
# does nothing. Declared phony so a file of the same name cannot shadow it.
.PHONY: testcoverageplatforms
testcoverageplatforms:
39+
40+
# Linux developer build: build Caffe, package the Maven modules, extract the
# Linux LMDB JNI library from the fat jar, set up the MNIST data, run the JVM
# tests, assemble the Python API zip and run the Python test script.
# Steps inside one recipe line are joined with `&&` so a failure stops the
# build; the pushd/popd pair is replaced by a subshell because those are bash
# builtins that the default /bin/sh may not provide.
.PHONY: build
build:
	cd caffe-public && $(MAKE) proto && $(MAKE) -j4 -e distribute
	export LD_LIBRARY_PATH="${LD_LIBRARY_PATH2}" && mvn ${MVN_SPARK_FLAG} -B package -DskipTests
	jar -xvf caffe-grid/target/caffe-grid-0.1-SNAPSHOT-jar-with-dependencies.jar META-INF/native/linux64/liblmdbjni.so
	mv META-INF/native/linux64/liblmdbjni.so ${CAFFE_ON_SPARK}/caffe-distri/distribute/lib
	${CAFFE_ON_SPARK}/scripts/setup-mnist.sh
	export LD_LIBRARY_PATH="${LD_LIBRARY_PATH2}" && mvn ${MVN_SPARK_FLAG} -B test
# Zip the CaffeOnSpark Python sources, add everything under
# caffe-public/python to the same archive, then move it to caffe-grid/target.
	cd ${CAFFE_ON_SPARK}/caffe-grid/src/main/python/ && zip -r caffeonsparkpythonapi * \
	  && (cd ${CAFFE_ON_SPARK}/caffe-public/python/ && zip -ur ${CAFFE_ON_SPARK}/caffe-grid/src/main/python/caffeonsparkpythonapi.zip *) \
	  && mv caffeonsparkpythonapi.zip ${CAFFE_ON_SPARK}/caffe-grid/target/
# NOTE(review): this step exports LD_LIBRARY_PATH, not LD_LIBRARY_PATH2 as the
# mvn steps do -- confirm PythonTest.sh does not need the Caffe library dirs.
	export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}; export SPARK_HOME=${SPARK_HOME};${CAFFE_ON_SPARK}/caffe-grid/src/test/python/PythonTest.sh
49+
2850
# macOS developer build: same flow as `build`, but using the osx64 LMDB JNI
# library and the DYLD_* library search path.
# Steps inside one recipe line are joined with `&&` so a failure stops the
# build; the pushd/popd pair is replaced by a subshell because those are bash
# builtins that the default /bin/sh may not provide.
.PHONY: buildosx
buildosx:
	cd caffe-public && $(MAKE) proto && $(MAKE) -j4 -e distribute
	export DYLD_LIBRARY_PATH="${DYLD_LIBRARY_PATH2}" && mvn ${MVN_SPARK_FLAG} -B package -DskipTests
	jar -xvf caffe-grid/target/caffe-grid-0.1-SNAPSHOT-jar-with-dependencies.jar META-INF/native/osx64/liblmdbjni.jnilib
	mv META-INF/native/osx64/liblmdbjni.jnilib ${CAFFE_ON_SPARK}/caffe-distri/distribute/lib
	${CAFFE_ON_SPARK}/scripts/setup-mnist.sh
# Export the search path under both names: the package step above uses
# DYLD_LIBRARY_PATH, while this step previously set only LD_LIBRARY_PATH.
	export DYLD_LIBRARY_PATH="${DYLD_LIBRARY_PATH2}" LD_LIBRARY_PATH="${DYLD_LIBRARY_PATH2}" && mvn ${MVN_SPARK_FLAG} -B test
# Zip the CaffeOnSpark Python sources, add everything under
# caffe-public/python to the same archive, then move it to caffe-grid/target.
# A second plain "zip; mv" step used to follow and replaced this archive with
# one that lacked the caffe-public/python files; it has been removed.
	cd ${CAFFE_ON_SPARK}/caffe-grid/src/main/python/ && zip -r caffeonsparkpythonapi * \
	  && (cd ${CAFFE_ON_SPARK}/caffe-public/python/ && zip -ur ${CAFFE_ON_SPARK}/caffe-grid/src/main/python/caffeonsparkpythonapi.zip *) \
	  && mv caffeonsparkpythonapi.zip ${CAFFE_ON_SPARK}/caffe-grid/target/
	export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}; export SPARK_HOME=${SPARK_HOME};${CAFFE_ON_SPARK}/caffe-grid/src/test/python/PythonTest.sh
60+
3861
update:
3962
git submodule init
4063
git submodule update --force
@@ -48,9 +71,9 @@ gh-pages:
4871
rm -rf scala_doc
4972
git checkout gh-pages scala_doc
5073

51-
clean:
52-
cd caffe-public; make clean; cd ..
53-
cd caffe-distri; make clean; cd ..
74+
# Remove build products: run `clean` in both Caffe source trees, then the
# Maven clean goal.
# `cd dir && $(MAKE) clean` replaces pushd/popd (bash builtins that the
# default /bin/sh may lack); $(MAKE) forwards make's own flags to the sub-make.
.PHONY: cleanplatforms
cleanplatforms:
	cd caffe-public && $(MAKE) clean
	cd caffe-distri && $(MAKE) clean
	mvn ${MVN_SPARK_FLAG} clean
5578

5679
# Alias for `build`. Declared phony because it names no file.
# NOTE(review): make's default goal is the first target in the file, not this
# one -- confirm that a bare `make` is meant to run that first target.
.PHONY: ALL
ALL: build

caffe-grid/src/main/python/com/yahoo/ml/caffe/DisplayUtils.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import numpy as np
55
from base64 import b64encode
66
from google.protobuf import text_format
7+
import array
78

89
import caffe
910
import caffe.draw
@@ -12,6 +13,10 @@
1213
def get_np_array(row):
1314
return np.frombuffer(row.data, 'uint8').reshape((row.height,row.width))
1415

16+
def get_image(image):
    """Build an inline HTML ``<img>`` tag from raw image bytes.

    :param image: the image content, in any form accepted as the initializer
        of ``array.array('b', ...)`` (e.g. a sequence of signed byte values)
    :return: an ``<img>`` tag whose ``src`` is a base64 ``data:image/png`` URI
    """
    # Local name avoids shadowing the builtin ``bytes``.
    raw = array.array('b', image)
    # b64encode returns bytes on Python 3; decode it so the string
    # concatenation below works on both Python 2 and Python 3.
    encoded = b64encode(raw).decode('ascii')
    return "<img src='data:image/png;base64," + encoded + "' />"
19+
1520
def image_tag(np_array):
1621
im = Image.fromarray(np_array, 'L')
1722
bytebuffer = BytesIO()
@@ -20,7 +25,6 @@ def image_tag(np_array):
2025

2126
def show_df(df, nrows=10):
2227
"""Displays a table of labels with their images, inline in html
23-
2428
:param DataFrame df: A python dataframe
2529
:param int nrows: First n rows to display from the dataframe
2630
"""
@@ -36,6 +40,26 @@ def show_df(df, nrows=10):
3640
html += "</table>"
3741
return HTML(html)
3842

43+
def show_captions(df, nrows=10):
    """Displays a table of predicted captions with their images, inline in html

    :param DataFrame df: A python dataframe; each row is read through its
        ``id``, ``data.image`` and ``prediction`` fields
    :param int nrows: First n rows to display from the dataframe
    :return: an ``HTML`` object wrapping the generated table
    """
    rows = df.take(nrows)
    # The header row is closed explicitly so the table markup is well formed.
    html = "<table><tr><th>Image Id</th><th>Image</th><th>Prediction</th></tr>"
    # Iterate over the rows actually returned: take() may yield fewer than
    # nrows, and indexing by range(nrows) would then raise IndexError.
    for row in rows:
        html += "<tr>"
        html += "<td>%s</td>" % row.id
        html += "<td>%s</td>" % get_image(row.data.image)
        html += "<td>%s</td>" % row.prediction
        html += "</tr>"
    html += "</table>"
    return HTML(html)
60+
61+
62+
3963
def show_network(input_net_proto_file, rankdir):
4064
"""Show the network graph in inline html, for the input prototxt file
4165
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# Copyright 2016 Yahoo Inc.
2+
# Licensed under the terms of the Apache 2.0 license.
3+
# Please see LICENSE file in the project root for terms.
4+
import caffe
5+
from examples.coco.retrieval_experiment import *
6+
from pyspark.sql import SQLContext
7+
from pyspark import SparkConf,SparkContext
8+
from pyspark.sql.types import *
9+
from itertools import izip_longest
10+
import json
11+
import argparse
12+
13+
def predict_caption(list_of_images, model, imagenet, lstmnet, vocab):
    """Yield a caption result for every image of an iterable.

    A single ``CaptionExperiment`` is built from the four settings (each
    coerced to ``str``) and reused for all images.

    :param list_of_images: iterable of image records passed to ``getCaption``
    :param model: value for the first ``CaptionExperiment`` argument
    :param imagenet: value for the second ``CaptionExperiment`` argument
    :param lstmnet: value for the third ``CaptionExperiment`` argument
    :param vocab: value for the fourth ``CaptionExperiment`` argument
    :return: an iterator over the ``getCaption`` results, in input order
    """
    ce = CaptionExperiment(str(model), str(imagenet), str(lstmnet), str(vocab))
    # Results are yielded one at a time instead of being collected in a list
    # first, so they are never all held in memory at once.
    for image in list_of_images:
        yield ce.getCaption(image)
19+
20+
def get_predictions(sqlContext, images, model, imagenet, lstmnet, vocab):
    """Caption every image partition and return the results as a DataFrame.

    :param sqlContext: context used to create the result DataFrame
    :param images: dataset whose partitions are handed to ``predict_caption``
    :param model: forwarded to ``predict_caption``
    :param imagenet: forwarded to ``predict_caption``
    :param lstmnet: forwarded to ``predict_caption``
    :param vocab: forwarded to ``predict_caption``
    :return: DataFrame restricted to ``result.id`` and ``result.prediction``
    """
    def caption_partition(partition):
        return predict_caption(partition, model, imagenet, lstmnet, vocab)

    captions = images.mapPartitions(caption_partition)
    # Each record is described as one nullable struct column named "result"
    # holding two nullable string fields.
    result_fields = [
        StructField("id", StringType(), True),
        StructField("prediction", StringType(), True),
    ]
    row_schema = StructType([StructField("result", StructType(result_fields), True)])
    frame = sqlContext.createDataFrame(captions, row_schema)
    return frame.select("result.id", "result.prediction")
25+
26+
def main():
    """Caption the images of a parquet dataset and write the result as JSON.

    Options are read from the ``spark.pythonargs`` Spark configuration entry
    as one whitespace-separated string: ``-input``, ``-model``, ``-imagenet``,
    ``-lstmnet``, ``-vocab`` and ``-output``.

    :raises ValueError: if ``spark.pythonargs`` is missing or empty
    """
    conf = SparkConf()
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    cmdargs = conf.get('spark.pythonargs')
    if not cmdargs:
        # conf.get returns None for an unset key; fail with a clear message
        # instead of an AttributeError on None.split.
        raise ValueError("spark.pythonargs is not set; pass the options via "
                         "--conf spark.pythonargs=\"...\"")
    parser = argparse.ArgumentParser(description="Image to Caption Util")
    # Every option is used below, so each one is required: an omitted option
    # would otherwise be forwarded as the literal string "None".
    for option in ("input", "model", "imagenet", "lstmnet", "vocab", "output"):
        parser.add_argument('-' + option, action="store", dest=option, required=True)

    # split() without a separator ignores repeated/leading/trailing blanks,
    # which split(" ") would turn into empty argument tokens.
    args = parser.parse_args(cmdargs.split())

    df_input = sqlContext.read.parquet(str(args.input))
    images = df_input.select("data.image", "data.height", "data.width", "id")
    df = get_predictions(sqlContext, images, str(args.model), str(args.imagenet), str(args.lstmnet), str(args.vocab))
    df.write.json(str(args.output))
45+
46+
47+
if __name__ == "__main__":
48+
main()
49+
50+

caffe-grid/src/main/python/examples/ImageCaptioning.ipynb

Lines changed: 156 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
Steps to run the COCO dataset for Image Captioning
2+
==================================================
3+
##### (1) Env setup
4+
export CAFFE_ON_SPARK=/Users/mridul/bigml/CaffeOnSpark
5+
export DYLD_LIBRARY_PATH=${CAFFE_ON_SPARK}/caffe-public/distribute/lib:${CAFFE_ON_SPARK}/caffe-distri/distribute/lib
6+
export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/usr/local/cuda/lib:/usr/local/mkl/lib/intel64/
7+
export LD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}
8+
export SPARK_HOME=/Users/mridul/bigml/spark-1.6.0-bin-hadoop2.6
9+
export PATH=${SPARK_HOME}/bin:${PATH}
10+
export PYSPARK_PYTHON=Python2.7.10/bin/python
11+
export PYTHONPATH=$PYTHONPATH:caffeonsparkpythonapi.zip:caffe_on_grid_archive/lib64:/usr/local/cuda-7.5/lib64
12+
export LD_LIBRARY_PATH=Python2.7.10/lib:/usr/local/cuda/lib:caffe_on_grid_archive/lib64/mkl/intel64/:${LD_LIBRARY_PATH}
13+
export DYLD_LIBRARY_PATH=Python2.7.10/lib:/usr/local/cuda/lib:caffe_on_grid_archive/lib64/mkl/intel64/:${LD_LIBRARY_PATH}
14+
export IPYTHON_ROOT=~/Python2.7.10
15+
unset SPARK_CONF_DIR
16+
17+
##### (2) Download the coco dataset if required
18+
19+
mkdir -p /tmp/coco
20+
pushd /tmp/coco
21+
wget http://msvocds.blob.core.windows.net/coco2014/train2014.zip
22+
wget http://msvocds.blob.core.windows.net/coco2014/val2014.zip
23+
wget http://msvocds.blob.core.windows.net/coco2014/test2014.zip
24+
wget http://msvocds.blob.core.windows.net/annotations-1-0-3/captions_train-val2014.zip
25+
unzip '*.zip'
26+
popd
27+
28+
##### (3) Create the input dataframe from cocodataset
29+
#-output the root directory for producing all the outputs
30+
#-imageRoot the root input directory for all the images (required)
31+
#-captionFile the input json which contains the image details and captions in coco format (check on mscoco.org)
32+
#-outputFormat the format of the output file to produce the dataframe
33+
#-imageCaptionDFDir the dataframe output dir name for images and their captions under -output, in json
34+
#-vocabDir the vocabulary for the dataframe under -output in desired outputFormat
35+
#-embeddingDFDir the dataframe output dir name for embedded images and their captions under -output in desired outputFormat
36+
37+
pushd ${CAFFE_ON_SPARK}/data/
38+
spark-submit --master ${MASTER_URL} --deploy-mode client \
39+
--conf spark.executor.extraClassPath=${CAFFE_ON_SPARK}/caffe-grid/target/caffe-grid-0.1-SNAPSHOT-jar-with-dependencies.jar \
40+
--driver-class-path ${CAFFE_ON_SPARK}/caffe-grid/target/caffe-grid-0.1-SNAPSHOT-jar-with-dependencies.jar \
41+
--class com.yahoo.ml.caffe.tools.CocoDataSetConverter \
42+
${CAFFE_ON_SPARK}/caffe-grid/target/caffe-grid-0.1-SNAPSHOT-jar-with-dependencies.jar \
43+
-output /tmp/coco/parquet/ \
44+
-imageRoot /tmp/coco/images/train2014/ \
45+
-captionFile /tmp/coco/annotations/captions_train2014.json \
46+
-outputFormat parquet \
47+
-imageCaptionDFDir df_image_caption_train2014 \
48+
-vocabDir vocab \
49+
-vocabSize 8800 \
50+
-embeddingDFDir df_embedded_train2014
51+
popd
52+
53+
##### (4) Train the image model
54+
pushd ${CAFFE_ON_SPARK}/data/
55+
spark-submit --master ${MASTER_URL} \
56+
--files train_val.prototxt,solver.prototxt \
57+
--conf spark.cores.max=${TOTAL_CORES} \
58+
--conf spark.task.cpus=${CORES_PER_WORKER} \
59+
--conf spark.driver.extraLibraryPath="${DYLD_LIBRARY_PATH}" \
60+
--conf spark.executorEnv.DYLD_LIBRARY_PATH="${DYLD_LIBRARY_PATH}" \
61+
--class com.yahoo.ml.caffe.CaffeOnSpark \
62+
${CAFFE_ON_SPARK}/caffe-grid/target/caffe-grid-0.1-SNAPSHOT-jar-with-dependencies.jar \
63+
-train \
64+
-conf solver.prototxt \
65+
-model /tmp/coco/bvlc_reference_caffenet.caffemodel \
66+
-devices 1
67+
hadoop fs -ls /tmp/coco/bvlc_reference_caffenet.caffemodel
68+
popd
69+
##### (5) Train the lstm
70+
pushd ${CAFFE_ON_SPARK}/data/
71+
spark-submit --master ${MASTER_URL} \
72+
--files lrcn_cos.prototxt,lrcn_solver.prototxt \
73+
--conf spark.cores.max=${TOTAL_CORES} \
74+
--conf spark.task.cpus=${CORES_PER_WORKER} \
75+
--conf spark.driver.extraLibraryPath="${DYLD_LIBRARY_PATH}" \
76+
--conf spark.executorEnv.DYLD_LIBRARY_PATH="${DYLD_LIBRARY_PATH}" \
77+
--class com.yahoo.ml.caffe.CaffeOnSpark \
78+
${CAFFE_ON_SPARK}/caffe-grid/target/caffe-grid-0.1-SNAPSHOT-jar-with-dependencies.jar \
79+
-train \
80+
-conf lrcn_solver.prototxt \
81+
-devices 1 \
82+
-resize \
83+
-weights /tmp/coco/bvlc_reference_caffenet.caffemodel \
84+
-model /tmp/coco/parquet/lrcn_coco.model
85+
popd
86+
87+
##### (6) Submit the data for inference
88+
Note that the below files also need to be shipped as shown
89+
#-model the image-lstm pretrained model to ship
90+
#-imagenet the image network definition
91+
#-lstmnet the lstm network definition
92+
#-vocab the vocabulary file (produced from above) for the given train set
93+
#-input the input embedding produced above
94+
#-output the path where to write the desired output
95+
96+
pushd ${CAFFE_ON_SPARK}/data/
97+
ln -s ~/Python2.7.10 Python2.7.10
98+
unzip ${CAFFE_ON_SPARK}/caffe-grid/target/caffeonsparkpythonapi.zip
99+
cat /tmp/coco/parquet/vocab/part* > vocab.txt
100+
rm -rf /tmp/coco/parquet/df_caption_results_train2014
101+
spark-submit --master ${MASTER_URL} \
102+
--conf spark.cores.max=${TOTAL_CORES} \
103+
--conf spark.task.cpus=${CORES_PER_WORKER} \
104+
--conf spark.driver.extraLibraryPath="${DYLD_LIBRARY_PATH}:Python2.7.10/lib" \
105+
--conf spark.executorEnv.LD_LIBRARY_PATH="${DYLD_LIBRARY_PATH}:Python2.7.10/lib" \
106+
--conf spark.pythonargs="-model /tmp/coco/parquet/lrcn_coco.model -imagenet deploy.prototxt -lstmnet lrcn_word_to_preds.deploy.prototxt -vocab vocab.txt -input /tmp/coco/parquet/df_embedded_train2014 -output /tmp/coco/parquet/df_caption_results_train2014" examples/ImageCaption.py
107+
popd
108+
##### (7) Launch IPython Notebook
109+
export IPYTHON_OPTS="notebook --no-browser --ip=127.0.0.1"
110+
pushd ${CAFFE_ON_SPARK}/data/
111+
ln -s ~/Python2.7.10 Python2.7.10
112+
unzip ${CAFFE_ON_SPARK}/caffe-grid/target/caffeonsparkpythonapi.zip
113+
cat /tmp/coco/parquet/vocab/part* > vocab.txt
114+
pyspark --master ${MASTER_URL} --deploy-mode client \
115+
--conf spark.driver.extraLibraryPath="${DYLD_LIBRARY_PATH}:Python2.7.10/lib" \
116+
--conf spark.executorEnv.LD_LIBRARY_PATH="${DYLD_LIBRARY_PATH}:Python2.7.10/lib" \
117+
--files "${CAFFE_ON_SPARK}/data/deploy.prototxt,${CAFFE_ON_SPARK}/data/vocab.txt,${CAFFE_ON_SPARK}/data/lrcn_word_to_preds.deploy.prototxt,${CAFFE_ON_SPARK}/data/caffe/_caffe.so" \
118+
--py-files "${CAFFE_ON_SPARK}/caffe-grid/target/caffeonsparkpythonapi.zip"
119+
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from coco import *
2+
3+
__all__=["coco"]

0 commit comments

Comments
 (0)