yahoo
diff --git a/‎Makefile‎
Lines changed: 40 additions & 17 deletions b/‎Makefile‎
Lines changed: 40 additions & 17 deletions
diff --git a/‎caffe-grid/src/main/python/com/yahoo/ml/caffe/DisplayUtils.py‎
Lines changed: 25 additions & 1 deletion b/‎caffe-grid/src/main/python/com/yahoo/ml/caffe/DisplayUtils.py‎
Lines changed: 25 additions & 1 deletion
diff --git a/‎caffe-grid/src/main/python/examples/ImageCaption.py‎
Lines changed: 50 additions & 0 deletions b/‎caffe-grid/src/main/python/examples/ImageCaption.py‎
Lines changed: 50 additions & 0 deletions
diff --git a/‎caffe-grid/src/main/python/examples/ImageCaptioning.ipynb‎
Lines changed: 156 additions & 0 deletions b/‎caffe-grid/src/main/python/examples/ImageCaptioning.ipynb‎
Lines changed: 156 additions & 0 deletions
diff --git a/‎caffe-grid/src/main/python/examples/README-coco.md‎
Lines changed: 119 additions & 0 deletions b/‎caffe-grid/src/main/python/examples/README-coco.md‎
Lines changed: 119 additions & 0 deletions
diff --git a/‎caffe-grid/src/main/python/examples/__init__.py‎
Lines changed: 3 additions & 0 deletions b/‎caffe-grid/src/main/python/examples/__init__.py‎
Lines changed: 3 additions & 0 deletions
@@ -1,40 +1,63 @@
+unexport PROJECT
 HOME ?=/home/${USER}
+# Locate Spark. When spark-submit is not on PATH, fall back to a default
+# install under $(HOME); otherwise derive SPARK_HOME from the spark-submit
+# that was found. Both use ?= so an environment/command-line value wins.
 ifeq ($(shell which spark-submit),)
-     SPARK_HOME ?=/home/y/share/spark
+     SPARK_HOME ?=$(HOME)/spark-1.6.0-bin-hadoop2.6
 else
      SPARK_HOME ?=$(shell which spark-submit 2>&1 | sed 's/\/bin\/spark-submit//g')
 endif
-CAFFE_ON_SPARK ?=$(shell pwd)
-LD_LIBRARY_PATH ?=/home/y/lib64:/home/y/lib64/mkl/intel64
+# Root of this checkout. Defaults to the directory make runs in rather than a
+# machine-specific absolute path, and can still be overridden by the caller.
+CAFFE_ON_SPARK ?=$(CURDIR)
+LD_LIBRARY_PATH ?=/usr/local/cuda/
 LD_LIBRARY_PATH2=${LD_LIBRARY_PATH}:${CAFFE_ON_SPARK}/caffe-public/distribute/lib:${CAFFE_ON_SPARK}/caffe-distri/distribute/lib:/usr/lib64:/lib64 
-DYLD_LIBRARY_PATH ?=/home/y/lib64:/home/y/lib64/mkl/intel64
-DYLD_LIBRARY_PATH2=${DYLD_LIBRARY_PATH}:${CAFFE_ON_SPARK}/caffe-public/distribute/lib:${CAFFE_ON_SPARK}/caffe-distri/distribute/lib:/usr/lib64:/lib64 
+# macOS counterparts of the two library-path variables above.
+DYLD_LIBRARY_PATH ?=/usr/local/cuda/lib
+DYLD_LIBRARY_PATH2=${DYLD_LIBRARY_PATH}:${CAFFE_ON_SPARK}/caffe-public/distribute/lib:${CAFFE_ON_SPARK}/caffe-distri/distribute/lib:/usr/lib64:/lib64
 
 export SPARK_VERSION=$(shell ${SPARK_HOME}/bin/spark-submit --version 2>&1 | grep version | awk '{print $$5}' | cut -d'.' -f1)
 ifeq (${SPARK_VERSION}, 2)
     export MVN_SPARK_FLAG=-Dspark2
 endif
 
-build:
+
+# screwdriver: aggregate target with no recipe of its own; requesting it
+# builds its prerequisites `platforms` and `package-release`.
+screwdriver: platforms package-release
+
+# platforms: refresh and clean the git submodules, build native Caffe,
+# package the Maven modules, unpack the LMDB JNI library into the distribute
+# lib dir, run the Maven tests, zip the Python API and run PythonTest.sh.
+# Steps that change directory are chained with && so a failed cd/zip stops
+# the recipe instead of running the next command in the wrong directory.
+platforms:
+	git submodule init
+	git submodule update --force
+	git submodule foreach --recursive git clean -dfx
+	cd caffe-public && $(MAKE) proto && $(MAKE) -j4 -e distribute
+	export LD_LIBRARY_PATH="${LD_LIBRARY_PATH2}"; mvn ${MVN_SPARK_FLAG} -B package -DskipTests
+	jar -xvf caffe-grid/target/caffe-grid-0.1-SNAPSHOT-jar-with-dependencies.jar META-INF/native/linux64/liblmdbjni.so
+	mv META-INF/native/linux64/liblmdbjni.so ${CAFFE_ON_SPARK}/caffe-distri/distribute/lib
+	export LD_LIBRARY_PATH="${LD_LIBRARY_PATH2}"; mvn ${MVN_SPARK_FLAG} -B test
+	cd ${CAFFE_ON_SPARK}/caffe-grid/src/main/python/ && zip -r caffeonsparkpythonapi * && mv caffeonsparkpythonapi.zip ${CAFFE_ON_SPARK}/caffe-grid/target/
+	export LD_LIBRARY_PATH="${LD_LIBRARY_PATH2}"; export SPARK_HOME="${SPARK_HOME}"; ${CAFFE_ON_SPARK}/caffe-grid/src/test/python/PythonTest.sh
+
+# package-release: run yinst_create inside pkg/ and copy the resulting
+# tarballs to $(AUTO_PUBLISH_DIR). Uses `cd ... &&` instead of pushd/popd so
+# it works under the default /bin/sh and stops if pkg/ is missing; the
+# destination is quoted so an unset AUTO_PUBLISH_DIR makes cp fail loudly.
+package-release:
+	cd pkg && yinst_create --clean -r
+	cp pkg/*.tgz "$(AUTO_PUBLISH_DIR)"
+
+# testcoverageplatforms: target with no prerequisites and no recipe.
+testcoverageplatforms:
+
+# build: Linux build. Builds native Caffe, packages the Maven modules,
+# unpacks the LMDB JNI library, sets up MNIST, runs the Maven tests, zips the
+# Python API together with caffe-public/python, then runs PythonTest.sh.
+# The zip step uses a subshell and && (not pushd/popd, which /bin/sh may not
+# provide) so the caffe-public files are really added from their own dir.
+build:
 	cd caffe-public; make proto; make -j4 -e distribute; cd ..
 	export LD_LIBRARY_PATH="${LD_LIBRARY_PATH2}"; mvn ${MVN_SPARK_FLAG} -B package -DskipTests
 	jar -xvf caffe-grid/target/caffe-grid-0.1-SNAPSHOT-jar-with-dependencies.jar META-INF/native/linux64/liblmdbjni.so
 	mv META-INF/native/linux64/liblmdbjni.so ${CAFFE_ON_SPARK}/caffe-distri/distribute/lib
 	${CAFFE_ON_SPARK}/scripts/setup-mnist.sh
-	export LD_LIBRARY_PATH="${LD_LIBRARY_PATH2}"; mvn ${MVN_SPARK_FLAG} -B package
-	cp -r ${CAFFE_ON_SPARK}/caffe-public/python/caffe ${CAFFE_ON_SPARK}/caffe-grid/src/main/python/
-	cd ${CAFFE_ON_SPARK}/caffe-grid/src/main/python/; zip -r caffeonsparkpythonapi  *; mv caffeonsparkpythonapi.zip ${CAFFE_ON_SPARK}/caffe-grid/target/;cd ${CAFFE_ON_SPARK}
-	export LD_LIBRARY_PATH="${LD_LIBRARY_PATH2}"; export SPARK_HOME="${SPARK_HOME}"; ${CAFFE_ON_SPARK}/caffe-grid/src/test/python/PythonTest.sh
+	export LD_LIBRARY_PATH="${LD_LIBRARY_PATH2}"; mvn ${MVN_SPARK_FLAG} -B test
+	cd ${CAFFE_ON_SPARK}/caffe-grid/src/main/python/ && zip -r caffeonsparkpythonapi * && (cd ${CAFFE_ON_SPARK}/caffe-public/python/ && zip -ur ${CAFFE_ON_SPARK}/caffe-grid/src/main/python/caffeonsparkpythonapi.zip *) && mv caffeonsparkpythonapi.zip ${CAFFE_ON_SPARK}/caffe-grid/target/
+	export LD_LIBRARY_PATH="${LD_LIBRARY_PATH2}"; export SPARK_HOME="${SPARK_HOME}"; ${CAFFE_ON_SPARK}/caffe-grid/src/test/python/PythonTest.sh
+
 buildosx:
 	cd caffe-public; make proto; make -j4 -e distribute; cd ..
 	export DYLD_LIBRARY_PATH="${DYLD_LIBRARY_PATH2}"; mvn ${MVN_SPARK_FLAG} -B package -DskipTests
 	jar -xvf caffe-grid/target/caffe-grid-0.1-SNAPSHOT-jar-with-dependencies.jar META-INF/native/osx64/liblmdbjni.jnilib
 	mv META-INF/native/osx64/liblmdbjni.jnilib ${CAFFE_ON_SPARK}/caffe-distri/distribute/lib
 	${CAFFE_ON_SPARK}/scripts/setup-mnist.sh
-	export DYLD_LIBRARY_PATH="${DYLD_LIBRARY_PATH2}"; mvn ${MVN_SPARK_FLAG} -B package
-	cp -r ${CAFFE_ON_SPARK}/caffe-public/python/caffe ${CAFFE_ON_SPARK}/caffe-grid/src/main/python/
-	cd ${CAFFE_ON_SPARK}/caffe-grid/src/main/python/; zip -r caffeonsparkpythonapi  *; mv caffeonsparkpythonapi.zip ${CAFFE_ON_SPARK}/caffe-grid/target/; cd ${CAFFE_ON_SPARK}
-	export LD_LIBRARY_PATH="${LD_LIBRARY_PATH2}"; export SPARK_HOME="${SPARK_HOME}"; ${CAFFE_ON_SPARK}/caffe-grid/src/test/python/PythonTest.sh
+	export LD_LIBRARY_PATH="${DYLD_LIBRARY_PATH2}"; mvn ${MVN_SPARK_FLAG} -B test
+	cd ${CAFFE_ON_SPARK}/caffe-grid/src/main/python/; zip -r caffeonsparkpythonapi  *; pushd ${CAFFE_ON_SPARK}/caffe-public/python/; zip -ur ${CAFFE_ON_SPARK}/caffe-grid/src/main/python/caffeonsparkpythonapi.zip *; popd; mv caffeonsparkpythonapi.zip ${CAFFE_ON_SPARK}/caffe-grid/target/; cd ${CAFFE_ON_SPARK}
+	cd ${CAFFE_ON_SPARK}/caffe-grid/src/main/python/; zip -r caffeonsparkpythonapi *; mv caffeonsparkpythonapi.zip ${CAFFE_ON_SPARK}/caffe-grid/target/; cd ${CAFFE_ON_SPARK}
+	export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}; export SPARK_HOME=${SPARK_HOME};${CAFFE_ON_SPARK}/caffe-grid/src/test/python/PythonTest.sh
+
 update:
 	git submodule init
 	git submodule update --force
@@ -48,9 +71,9 @@ gh-pages:
 	rm -rf scala_doc
 	git checkout gh-pages scala_doc
 
-clean:
-	cd caffe-public; make clean; cd ..
-	cd caffe-distri; make clean; cd ..
+# cleanplatforms: run `clean` in both native subprojects, then `mvn clean`.
+# Each recipe line runs in its own shell, so a plain `cd dir && ...` is
+# enough; no pushd/popd (not available in every /bin/sh) is needed.
+cleanplatforms:
+	cd caffe-public && $(MAKE) clean
+	cd caffe-distri && $(MAKE) clean
 	mvn ${MVN_SPARK_FLAG} clean
 
 ALL: build
@@ -4,6 +4,7 @@
 import numpy as np
 from base64 import b64encode
 from google.protobuf import text_format
+import array
 
 import caffe
 import caffe.draw
@@ -12,6 +13,10 @@
 def get_np_array(row):
     return np.frombuffer(row.data, 'uint8').reshape((row.height,row.width))
 
+def get_image(image):
+    """Build an inline html ``<img>`` tag that embeds the image as a base64 data URI
+
+        :param image: image content accepted by ``array.array('b', ...)``
+        :return: html string for the tag, labelled as ``image/png``
+    """
+    raw = array.array('b', image)
+    encoded = b64encode(raw)
+    return "<img src='data:image/png;base64," + encoded + "' />"
+
 def image_tag(np_array): 
     im = Image.fromarray(np_array, 'L')
     bytebuffer = BytesIO()
@@ -20,7 +25,6 @@ def image_tag(np_array):
 
 def show_df(df, nrows=10):
     """Displays a table of labels with their images, inline in html
-
         :param DataFrame df: A python dataframe
         :param int nrows: First n rows to display from the dataframe
     """
@@ -36,6 +40,26 @@ def show_df(df, nrows=10):
     html += "</table>"
     return HTML(html)
 
+def show_captions(df, nrows=10):
+    """Displays a table of predicted captions with their images, inline in html
+
+        :param DataFrame df: A python dataframe; each row is read through
+            ``row.id``, ``row.data.image`` and ``row.prediction``
+        :param int nrows: First n rows to display from the dataframe
+    """
+    # take() can return fewer than nrows rows, so walk what was actually
+    # returned instead of indexing range(nrows).
+    data = df.take(nrows)
+    html = "<table><tr><th>Image Id</th><th>Image</th><th>Prediction</th></tr>"
+    for row in data:
+        html += "<tr>"
+        html += "<td>%s</td>" % row.id
+        html += "<td>%s</td>" % get_image(row.data.image)
+        html += "<td>%s</td>" % row.prediction
+        html += "</tr>"
+    html += "</table>"
+    return HTML(html)
+
+
+
 def show_network(input_net_proto_file, rankdir):
     """Show the network graph in inline html, for the input prototxt file
 
 
@@ -0,0 +1,50 @@
+# Copyright 2016 Yahoo Inc.
+# Licensed under the terms of the Apache 2.0 license.
+# Please see LICENSE file in the project root for terms.
+import caffe
+from examples.coco.retrieval_experiment import *
+from pyspark.sql import SQLContext
+from pyspark import SparkConf,SparkContext
+from pyspark.sql.types import *
+from itertools import izip_longest
+import json
+import argparse
+
+def predict_caption(list_of_images, model, imagenet, lstmnet, vocab):
+  """Caption every image of an iterable with a single CaptionExperiment.
+
+  The experiment is constructed once from the four (stringified) arguments
+  and reused for all images; the captions are returned as an iterator.
+  """
+  experiment = CaptionExperiment(str(model), str(imagenet), str(lstmnet), str(vocab))
+  captions = [experiment.getCaption(image) for image in list_of_images]
+  return iter(captions)
+
+def get_predictions(sqlContext, images, model, imagenet, lstmnet, vocab):
+  """Run predict_caption over each partition of ``images``.
+
+  Returns a dataframe with the string columns ``id`` and ``prediction``,
+  selected out of the nested ``result`` struct.
+  """
+  def caption_partition(partition):
+    return predict_caption(partition, model, imagenet, lstmnet, vocab)
+
+  captioned = images.mapPartitions(caption_partition)
+  result_fields = StructType([
+    StructField("id", StringType(), True),
+    StructField("prediction", StringType(), True)])
+  schema = StructType([StructField("result", result_fields, True)])
+  frame = sqlContext.createDataFrame(captioned, schema)
+  return frame.select("result.id", "result.prediction")
+
+def main():
+  """Caption the images of a parquet dataframe and write the result as json.
+
+  Options are read from the Spark conf property ``spark.pythonargs`` as one
+  whitespace-separated string: -input, -model, -imagenet, -lstmnet, -vocab
+  and -output. All six are required; a missing property or option ends the
+  run with an argparse usage error.
+  """
+  conf = SparkConf()
+  sc = SparkContext(conf=conf)
+  sqlContext = SQLContext(sc)
+  parser = argparse.ArgumentParser(description="Image to Caption Util")
+  parser.add_argument('-input', action="store", dest="input", required=True)
+  parser.add_argument('-model', action="store", dest="model", required=True)
+  parser.add_argument('-imagenet', action="store", dest="imagenet", required=True)
+  parser.add_argument('-lstmnet', action="store", dest="lstmnet", required=True)
+  parser.add_argument('-vocab', action="store", dest="vocab", required=True)
+  parser.add_argument('-output', action="store", dest="output", required=True)
+
+  cmdargs = conf.get('spark.pythonargs')
+  if cmdargs is None:
+    parser.error("spark.pythonargs is not set; pass the options with --conf spark.pythonargs=\"...\"")
+  # split() with no argument collapses runs of whitespace, so repeated or
+  # trailing spaces do not turn into empty positional arguments.
+  args = parser.parse_args(cmdargs.split())
+
+  df_input = sqlContext.read.parquet(str(args.input))
+  images = df_input.select("data.image","data.height", "data.width", "id")
+  df = get_predictions(sqlContext, images, str(args.model), str(args.imagenet), str(args.lstmnet), str(args.vocab))
+  df.write.json(str(args.output))
+
+
+if __name__ == "__main__":
+    main()
+
+
@@ -0,0 +1,119 @@
+Steps to run the COCO dataset for Image Captioning
+==================================================
+##### (1) Env setup
+    export CAFFE_ON_SPARK=/Users/mridul/bigml/CaffeOnSpark
+    export DYLD_LIBRARY_PATH=${CAFFE_ON_SPARK}/caffe-public/distribute/lib:${CAFFE_ON_SPARK}/caffe-distri/distribute/lib
+    export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/usr/local/cuda/lib:/usr/local/mkl/lib/intel64/
+    export LD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}
+    export SPARK_HOME=/Users/mridul/bigml/spark-1.6.0-bin-hadoop2.6
+    export PATH=${SPARK_HOME}/bin:${PATH}
+    export PYSPARK_PYTHON=Python2.7.10/bin/python
+    export PYTHONPATH=$PYTHONPATH:caffeonsparkpythonapi.zip:caffe_on_grid_archive/lib64:/usr/local/cuda-7.5/lib64
+    export LD_LIBRARY_PATH=Python2.7.10/lib:/usr/local/cuda/lib:caffe_on_grid_archive/lib64/mkl/intel64/:${LD_LIBRARY_PATH}
+    export DYLD_LIBRARY_PATH=Python2.7.10/lib:/usr/local/cuda/lib:caffe_on_grid_archive/lib64/mkl/intel64/:${LD_LIBRARY_PATH}
+    export IPYTHON_ROOT=~/Python2.7.10
+    unset SPARK_CONF_DIR
+
+##### (2) Download the coco dataset if required
+
+    mkdir -p /tmp/coco
+    pushd /tmp/coco
+    wget http://msvocds.blob.core.windows.net/coco2014/train2014.zip
+    wget http://msvocds.blob.core.windows.net/coco2014/val2014.zip
+    wget http://msvocds.blob.core.windows.net/coco2014/test2014.zip
+    wget http://msvocds.blob.core.windows.net/annotations-1-0-3/captions_train-val2014.zip
+    unzip '*.zip'
+    popd
+
+##### (3) Create the input dataframe from cocodataset
+    #-output the root directory for producing all the outputs
+    #-imageRoot the root input directory for all the images (required)
+    #-captionFile the input json which contains the image details and captions in coco format (check on mscoco.org)
+    #-outputFormat the format of the output file to produce the dataframe
+    #-imageCaptionDFDir the dataframe output dir name for images and their captions under -output, in json
+    #-vocabDir the vocabulary for the dataframe under -output in desired outputFormat
+    #-embeddingDFDir the dataframe output dir name for embedded images and their captions under -output in desired outputFormat
+   
+    pushd ${CAFFE_ON_SPARK}/data/
+    spark-submit --master ${MASTER_URL} --deploy-mode client \
+        --conf spark.executor.extraClassPath=${CAFFE_ON_SPARK}/caffe-grid/target/caffe-grid-0.1-SNAPSHOT-jar-with-dependencies.jar \
+        --driver-class-path ${CAFFE_ON_SPARK}/caffe-grid/target/caffe-grid-0.1-SNAPSHOT-jar-with-dependencies.jar \
+        --class com.yahoo.ml.caffe.tools.CocoDataSetConverter  \
+        ${CAFFE_ON_SPARK}/caffe-grid/target/caffe-grid-0.1-SNAPSHOT-jar-with-dependencies.jar \
+        -output  /tmp/coco/parquet/ \
+        -imageRoot /tmp/coco/images/train2014/ \
+        -captionFile /tmp/coco/annotations/captions_train2014.json \
+        -outputFormat parquet \
+        -imageCaptionDFDir df_image_caption_train2014 \
+        -vocabDir vocab \
+	-vocabSize 8800 \
+        -embeddingDFDir df_embedded_train2014
+    popd
+
+##### (4) Train the image model
+    pushd ${CAFFE_ON_SPARK}/data/
+    spark-submit --master ${MASTER_URL} \
+        --files train_val.prototxt,solver.prototxt \
+        --conf spark.cores.max=${TOTAL_CORES} \
+        --conf spark.task.cpus=${CORES_PER_WORKER} \
+    	--conf spark.driver.extraLibraryPath="${DYLD_LIBRARY_PATH}" \
+	--conf spark.executorEnv.DYLD_LIBRARY_PATH="${DYLD_LIBRARY_PATH}" \
+        --class com.yahoo.ml.caffe.CaffeOnSpark  \
+	${CAFFE_ON_SPARK}/caffe-grid/target/caffe-grid-0.1-SNAPSHOT-jar-with-dependencies.jar \
+            -train \
+            -conf solver.prototxt \
+            -model /tmp/coco/bvlc_reference_caffenet.caffemodel \
+            -devices 1
+    hadoop fs -ls /tmp/coco/bvlc_reference_caffenet.caffemodel
+    popd
+##### (5) Train the lstm
+    pushd ${CAFFE_ON_SPARK}/data/
+    spark-submit --master ${MASTER_URL} \
+        --files lrcn_cos.prototxt,lrcn_solver.prototxt \
+	--conf spark.cores.max=${TOTAL_CORES} \
+        --conf spark.task.cpus=${CORES_PER_WORKER} \
+        --conf spark.driver.extraLibraryPath="${DYLD_LIBRARY_PATH}" \
+        --conf spark.executorEnv.DYLD_LIBRARY_PATH="${DYLD_LIBRARY_PATH}" \
+        --class com.yahoo.ml.caffe.CaffeOnSpark  \
+  	  ${CAFFE_ON_SPARK}/caffe-grid/target/caffe-grid-0.1-SNAPSHOT-jar-with-dependencies.jar \
+	     -train \
+             -conf lrcn_solver.prototxt \
+             -devices 1 \
+             -resize \
+             -weights /tmp/coco/bvlc_reference_caffenet.caffemodel \
+             -model /tmp/coco/parquet/lrcn_coco.model
+    popd
+
+##### (6) Submit the data for inference
+    Note that the below files also need to be shipped as shown
+    #-model the image-lstm pretrained model to ship 
+    #-imagenet the image network definition
+    #-lstmnet the lstm network definition
+    #-vocab the vocabulary file (produced from above) for the given train set
+    #-input the input embedding produced above
+    #-output the path where to write the desired output
+
+    pushd ${CAFFE_ON_SPARK}/data/
+    ln -s ~/Python2.7.10 Python2.7.10
+    unzip ${CAFFE_ON_SPARK}/caffe-grid/target/caffeonsparkpythonapi.zip
+    cat /tmp/coco/parquet/vocab/part* > vocab.txt
+    rm -rf /tmp/coco/parquet/df_caption_results_train2014
+    spark-submit --master ${MASTER_URL} \
+    		 --conf spark.cores.max=${TOTAL_CORES} \
+    		 --conf spark.task.cpus=${CORES_PER_WORKER} \
+    		 --conf spark.driver.extraLibraryPath="${DYLD_LIBRARY_PATH}:Python2.7.10/lib" \
+    		 --conf spark.executorEnv.LD_LIBRARY_PATH="${DYLD_LIBRARY_PATH}:Python2.7.10/lib" \
+    		 --conf spark.pythonargs="-model /tmp/coco/parquet/lrcn_coco.model -imagenet deploy.prototxt -lstmnet lrcn_word_to_preds.deploy.prototxt -vocab vocab.txt -input /tmp/coco/parquet/df_embedded_train2014 -output /tmp/coco/parquet/df_caption_results_train2014" examples/ImageCaption.py
+    popd
+##### (7) Launch IPython Notebook
+    export IPYTHON_OPTS="notebook --no-browser --ip=127.0.0.1"
+    pushd ${CAFFE_ON_SPARK}/data/
+    ln -s ~/Python2.7.10 Python2.7.10
+    unzip ${CAFFE_ON_SPARK}/caffe-grid/target/caffeonsparkpythonapi.zip
+    cat /tmp/coco/parquet/vocab/part* > vocab.txt
+    pyspark --master ${MASTER_URL} --deploy-mode client \
+    	    --conf spark.driver.extraLibraryPath="${DYLD_LIBRARY_PATH}:Python2.7.10/lib" \
+	    --conf spark.executorEnv.LD_LIBRARY_PATH="${DYLD_LIBRARY_PATH}:Python2.7.10/lib" \
+	    --files "${CAFFE_ON_SPARK}/data/deploy.prototxt,${CAFFE_ON_SPARK}/data/vocab.txt,${CAFFE_ON_SPARK}/data/lrcn_word_to_preds.deploy.prototxt,${CAFFE_ON_SPARK}/data/caffe/_caffe.so" \
+	    --py-files "${CAFFE_ON_SPARK}/caffe-grid/target/caffeonsparkpythonapi.zip"
+    
@@ -0,0 +1,3 @@
+from coco import *
+
+__all__=["coco"]
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+from coco import *`
	`2`	`+`
	`3`	`+__all__=["coco"]`