microsoft · praneet22 · Feb 21, 2019 · Feb 21, 2019 · Feb 21, 2019 · Feb 21, 2019
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -28,7 +28,7 @@ steps:
 
   displayName: 'replace subscription value'
 
-- script: 'python code/testing/data_test.py data/diabetes.csv && python code/testing/data_test.py data/diabetes_bad_dist.csv && python code/testing/data_test.py data/diabetes_bad_schema.csv && python code/testing/data_test.py data/diabetes_missing_values.csv'
+- script: 'pytest tests/unit/data_test.py'
   displayName: 'Data Quality Check'
 
 - script: 'python aml_service/00-WorkSpace.py'

diff --git a/environment_setup/requirements.txt b/environment_setup/requirements.txt
@@ -1,4 +1,5 @@
 scipy==1.0.0
 scikit-learn==0.19.1
 numpy==1.14.5
-pandas==0.23.1
+pandas==0.23.1
+pytest==4.3.0
diff --git a/code/testing/data_test.py → tests/unit/data_test.py b/code/testing/data_test.py → tests/unit/data_test.py
@@ -24,14 +24,22 @@
 ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
 """
-
-import sys
 import os
 import numpy as np
 import pandas as pd
 
+
+# get absolute path of csv files from data folder
+def get_absPath(filename):
+    """Return the absolute path of ``filename`` inside the data folder.
+
+    The data folder is located two directory levels above the directory
+    that contains this file, i.e. ``<this dir>/../../data``.
+
+    :param filename: name of a file in the data folder (e.g. a csv file)
+    :return: absolute path to that file
+    """
+    this_dir = os.path.dirname(__file__)
+    return os.path.abspath(
+        os.path.join(this_dir, os.path.pardir, os.path.pardir,
+                     "data", filename))
+
+
 # number of features
-n_columns = 10
+expected_columns = 10
 
 # distribution of features in the training set
 historical_mean = np.array(
@@ -65,60 +73,50 @@
     ]
 )
 
-# maximal relative change in feature mean or standrd deviation that we can tolerate
+# maximal relative change in feature mean or standard deviation
+# that we can tolerate
 shift_tolerance = 3
 
 
-def check_schema(X):
-    n_actual_columns = X.shape[1]
-    if n_actual_columns != n_columns:
-        print(
-            "Error: found {} feature columns. The data should have {} feature columns.".format(
-                n_actual_columns, n_columns
-            )
-        )
-        return False
-
-    return True
-
-
-def check_missing_values(dataset):
+def test_check_schema():
+    """Check that ``diabetes.csv`` has the expected number of columns.
+
+    Loads the file from the data folder, leaves out the last column and
+    asserts that the count of the remaining columns equals
+    ``expected_columns``.
+    """
+    csv_path = get_absPath("diabetes.csv")
+    # the data file must be present before it can be inspected
+    assert os.path.exists(csv_path)
+    frame = pd.read_csv(csv_path)
+    # every column except the last one is counted
+    leading_names = frame.columns[:-1]
+    n_found = frame[leading_names].shape[1]
+    assert n_found == expected_columns
+
+
+def test_check_bad_schema():
+    """Check that ``diabetes_bad_schema.csv`` violates the expected schema.
+
+    Loads the file from the data folder, leaves out the last column and
+    asserts that the count of the remaining columns differs from
+    ``expected_columns``.
+    """
+    csv_path = get_absPath("diabetes_bad_schema.csv")
+    # the data file must be present before it can be inspected
+    assert os.path.exists(csv_path)
+    frame = pd.read_csv(csv_path)
+    # every column except the last one is counted
+    leading_names = frame.columns[:-1]
+    n_found = frame[leading_names].shape[1]
+    assert n_found != expected_columns
+
+
+def test_check_missing_values():
+    """Check that ``diabetes_missing_values.csv`` contains missing values.
+
+    Loads the file from the data folder and asserts that at least one
+    cell of the table is NaN.
+    """
+    datafile = get_absPath("diabetes_missing_values.csv")
+    # check that file exists
+    assert(os.path.exists(datafile))
+    dataset = pd.read_csv(datafile)
+    # count NaN cells across every column of the file
     n_nan = np.sum(np.isnan(dataset.values))
-    if n_nan > 0:
-        print("Warning: the data has {} missing values".format(n_nan))
-        return False
-    return True
+    assert(n_nan > 0)
 
 
-def check_distribution(dataset):
+def test_check_distribution():
+    """Check that ``diabetes_bad_dist.csv`` is detected as shifted data.
+
+    Loads the file from the data folder, computes the per-column mean and
+    standard deviation and asserts that at least one of them differs from
+    ``historical_mean`` / ``historical_std`` by more than
+    ``shift_tolerance`` times the magnitude of the historical value.
+    """
+    datafile = get_absPath("diabetes_bad_dist.csv")
+    # check that file exists
+    assert os.path.exists(datafile)
+    dataset = pd.read_csv(datafile)
     mean = np.mean(dataset.values, axis=0)
-    std = np.mean(dataset.values, axis=0)
-    if (
-        np.sum(abs(mean - historical_mean) > shift_tolerance * abs(historical_mean)) > 0
-        or np.sum(abs(std - historical_std) > shift_tolerance * abs(historical_std)) > 0
-    ):
-        print("Warning: new data has different distribution than the training data")
-        return False
-    return True
-
-
-def main():
-    filename = sys.argv[1]
-    if not os.path.exists(filename):
-        print("Error: The file {} does not exist".format(filename))
-        return
-
-    dataset = pd.read_csv(filename)
-    if check_schema(dataset[dataset.columns[:-1]]):
-        print("Data schema test succeeded")
-        if check_missing_values(dataset) and check_distribution(dataset):
-            print("Missing values test passed")
-            print("Data distribution test passed")
-        else:
-            print(
-                "There might be some issues with the data. Please check warning messages."
-            )
-
-
-if __name__ == "__main__":
-    main()
+    # np.std, not np.mean: the spread of each column is what has to be
+    # compared against historical_std
+    # NOTE(review): np.std defaults to ddof=0 - confirm this matches how
+    # historical_std was computed
+    std = np.std(dataset.values, axis=0)
+    # a column counts as shifted when its statistic moves by more than
+    # shift_tolerance times the magnitude of the historical value
+    mean_shifted = (abs(mean - historical_mean) >
+                    shift_tolerance * abs(historical_mean))
+    std_shifted = (abs(std - historical_std) >
+                   shift_tolerance * abs(historical_std))
+    assert mean_shifted.any() or std_shifted.any()