Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ steps:

displayName: 'replace subscription value'

- script: 'python code/testing/data_test.py data/diabetes.csv && python code/testing/data_test.py data/diabetes_bad_dist.csv && python code/testing/data_test.py data/diabetes_bad_schema.csv && python code/testing/data_test.py data/diabetes_missing_values.csv'
- script: 'pytest tests/unit/data_test.py'
displayName: 'Data Quality Check'

- script: 'python aml_service/00-WorkSpace.py'
Expand Down
3 changes: 2 additions & 1 deletion environment_setup/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
scipy==1.0.0
scikit-learn==0.19.1
numpy==1.14.5
pandas==0.23.1
pandas==0.23.1
pytest==4.3.0
102 changes: 50 additions & 52 deletions code/testing/data_test.py → tests/unit/data_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,22 @@
ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
"""

import sys
import os
import numpy as np
import pandas as pd


# get absolute path of csv files from data folder
def get_absPath(filename):
    """Return the absolute path of ``filename`` inside the ``data`` folder.

    The ``data`` folder is resolved two directory levels above the
    directory that contains this module.
    """
    module_dir = os.path.dirname(__file__)
    two_levels_up = os.path.join(module_dir, os.path.pardir, os.path.pardir)
    return os.path.abspath(os.path.join(two_levels_up, "data", filename))


# number of features
n_columns = 10
expected_columns = 10

# distribution of features in the training set
historical_mean = np.array(
Expand Down Expand Up @@ -65,60 +73,50 @@
]
)

# maximal relative change in feature mean or standard deviation that we can tolerate
# maximal relative change in feature mean or standard deviation
# that we can tolerate
shift_tolerance = 3


def check_schema(X):
    """Return True when ``X`` has exactly ``n_columns`` columns.

    On a mismatch an error message with the found and expected column
    counts is printed and False is returned.
    """
    found = X.shape[1]
    if found == n_columns:
        return True

    template = (
        "Error: found {} feature columns. "
        "The data should have {} feature columns."
    )
    print(template.format(found, n_columns))
    return False


def check_missing_values(dataset):
def test_check_schema():
    """Check that ``diabetes.csv`` has the expected number of feature columns.

    Every column except the last one is counted as a feature column.
    """
    csv_path = get_absPath("diabetes.csv")
    # the data file must be present before it can be inspected
    assert os.path.exists(csv_path)

    frame = pd.read_csv(csv_path)
    feature_names = frame.columns[:-1]
    features = frame[feature_names]
    # the feature count must match the expected schema
    assert features.shape[1] == expected_columns


def test_check_bad_schema():
    """Check that ``diabetes_bad_schema.csv`` does NOT match the schema.

    Every column except the last one is counted as a feature column; the
    test passes when that count differs from ``expected_columns``.
    """
    csv_path = get_absPath("diabetes_bad_schema.csv")
    # the data file must be present before it can be inspected
    assert os.path.exists(csv_path)

    frame = pd.read_csv(csv_path)
    feature_names = frame.columns[:-1]
    features = frame[feature_names]
    # a malformed file is expected to have a different feature count
    assert features.shape[1] != expected_columns


def test_check_missing_values():
    """Check that ``diabetes_missing_values.csv`` contains missing values.

    The test passes when at least one cell of the file is missing.
    """
    datafile = get_absPath("diabetes_missing_values.csv")
    # check that file exists
    assert os.path.exists(datafile)
    dataset = pd.read_csv(datafile)
    # isnull() counts missing cells in numeric and non-numeric columns
    # alike, whereas np.isnan raises on object-typed data.
    n_nan = int(dataset.isnull().values.sum())
    # Assert directly: returning a bool before the assertion would leave it
    # unreachable, and a pytest test that only returns can never fail.
    assert n_nan > 0


def check_distribution(dataset):
def test_check_distribution():
    """Check that ``diabetes_bad_dist.csv`` is detected as distribution-shifted.

    The column-wise mean and standard deviation of the file are compared
    with ``historical_mean`` and ``historical_std``. The test passes when at
    least one column deviates by more than ``shift_tolerance`` times the
    absolute historical value.
    """
    datafile = get_absPath("diabetes_bad_dist.csv")
    # check that file exists
    assert os.path.exists(datafile)
    dataset = pd.read_csv(datafile)

    mean = np.mean(dataset.values, axis=0)
    # The standard deviation must come from np.std; computing it with
    # np.mean turned the second comparison into a mean-vs-std comparison.
    # NOTE(review): np.std defaults to ddof=0 -- confirm this matches how
    # historical_std was produced.
    std = np.std(dataset.values, axis=0)

    mean_shifted = (np.abs(mean - historical_mean) >
                    shift_tolerance * np.abs(historical_mean))
    std_shifted = (np.abs(std - historical_std) >
                   shift_tolerance * np.abs(historical_std))
    # Assert instead of printing and returning a bool, so that pytest
    # actually fails when no shift is detected.
    assert mean_shifted.any() or std_shifted.any()


def main():
# Command-line entry point: runs the data checks on the CSV file whose
# path is given as the first command-line argument and prints the outcome.
filename = sys.argv[1]
# Stop early, with an error message, when the given path does not exist.
if not os.path.exists(filename):
print("Error: The file {} does not exist".format(filename))
return

dataset = pd.read_csv(filename)
# The schema check receives every column except the last one.
if check_schema(dataset[dataset.columns[:-1]]):
print("Data schema test succeeded")
# Both success messages are printed only when both checks return True.
if check_missing_values(dataset) and check_distribution(dataset):
print("Missing values test passed")
print("Data distribution test passed")
# NOTE(review): indentation is not preserved here, so it is unclear whether
# this `else` pairs with the missing-values/distribution check or with the
# schema check above -- confirm against the original file.
else:
print(
"There might be some issues with the data. Please check warning messages."
)


# Run main() only when this file is executed as a script.
if __name__ == "__main__":
main()
# NOTE(review): the assertion below reads `mean` and `std`, which are only
# assigned inside test_check_distribution; it presumably belongs at the end
# of that function rather than under this guard -- confirm.
assert(np.sum(abs(mean - historical_mean) > shift_tolerance *
abs(historical_mean)) or
np.sum(abs(std - historical_std) > shift_tolerance *
abs(historical_std)) > 0)