Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ MODEL_PATH = ''
EVALUATE_SCRIPT_PATH = 'evaluate/evaluate_model.py'
REGISTER_SCRIPT_PATH = 'register/register_model.py'
SOURCES_DIR_TRAIN = 'code'
DATASET_NAME = 'diabetes_ds'
DATASTORE_NAME = 'datablobstore'
DATAFILE_NAME = 'diabetes.csv'

# Optional. Used by a training pipeline with R on Databricks
DB_CLUSTER_ID = ''
Expand Down
4 changes: 3 additions & 1 deletion .pipelines/azdo-variables.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,6 @@ variables:
- name: DB_CLUSTER_ID
value: ''
- name: SCORE_SCRIPT
value: score.py
value: score.py
- name: DATASET_NAME
value: diabetes_ds
18 changes: 17 additions & 1 deletion code/training/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
POSSIBILITY OF SUCH DAMAGE.
"""
from azureml.core.run import Run
from azureml.core import Dataset
import os
import argparse
from sklearn.datasets import load_diabetes
Expand Down Expand Up @@ -69,19 +70,34 @@ def main():
"must be a positive float.")
)

parser.add_argument(
"--dataset_name",
type=str,
help=("Dataset with the training data")
)
args = parser.parse_args()

print("Argument [build_id]: %s" % args.build_id)
print("Argument [model_name]: %s" % args.model_name)
print("Argument [alpha]: %s" % args.alpha)
print("Argument [dataset_name]: %s" % args.dataset_name)

model_name = args.model_name
build_id = args.build_id
alpha = args.alpha
dataset_name = args.dataset_name

run = Run.get_context()
ws = run.experiment.workspace

if (dataset_name):
dataset = Dataset.get_by_name(workspace=ws, name=dataset_name)
df = dataset.to_pandas_dataframe()
X = df.values
y = df.Y
else:
X, y = load_diabetes(return_X_y=True)

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=0)
data = {"train": {"X": X_train, "y": y_train},
Expand Down
4 changes: 4 additions & 0 deletions docs/getting_started.md
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,10 @@ Check out the newly created resources in the [Azure Portal](portal.azure.com):

(Optional) To remove the resources created for this project, you can use the [/environment_setup/iac-remove-environment.yml](../environment_setup/iac-remove-environment.yml) definition, or you can just delete the resource group in the [Azure Portal](https://portal.azure.com).

**Note:** The training ML pipeline uses a [sample diabetes dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html) as training data. If you want to use your own dataset, you need to [create and register a datastore](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-access-data#azure-machine-learning-studio) in your ML workspace and upload the data file (e.g. [diabetes.csv](./data/diabetes.csv)) to the corresponding blob container. You can also define a datastore in the ML workspace with the [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/ext/azure-cli-ml/ml/datastore?view=azure-cli-latest#ext-azure-cli-ml-az-ml-datastore-attach-blob).
You'll also need to configure the DATASTORE_NAME and DATAFILE_NAME variables in the ***devopsforai-aml-vg*** variable group.


## Create an Azure DevOps Azure ML Workspace Service Connection
Install the **Azure Machine Learning** extension to your organization from the
[marketplace](https://marketplace.visualstudio.com/items?itemName=ms-air-aiagility.vss-services-azureml),
Expand Down
17 changes: 15 additions & 2 deletions ml_service/pipelines/build_train_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from azureml.pipeline.core import Pipeline
from azureml.core import Workspace
from azureml.core.runconfig import RunConfiguration, CondaDependencies
from azureml.core import Dataset, Datastore
import os
import sys
sys.path.append(os.path.abspath("./ml_service/util")) # NOQA: E402
Expand Down Expand Up @@ -35,10 +36,10 @@ def main():
'scikit-learn', 'tensorflow', 'keras'],
pip_packages=['azure', 'azureml-core',
'azure-storage',
'azure-storage-blob'])
'azure-storage-blob',
'azureml-dataprep'])
)
run_config.environment.docker.enabled = True

config_envvar = {}
if (e.collection_uri is not None and e.teamproject_name is not None):
builduri_base = e.collection_uri + e.teamproject_name
Expand All @@ -53,6 +54,17 @@ def main():
hyperparameter_alpha_param = PipelineParameter(
name="hyperparameter_alpha", default_value=0.5)

dataset_name = ""
if (e.datastore_name is not None and e.datafile_name is not None):
dataset_name = e.dataset_name
datastore = Datastore.get(aml_workspace, e.datastore_name)
data_path = [(datastore, e.datafile_name)]
dataset = Dataset.Tabular.from_delimited_files(path=data_path)
dataset.register(workspace=aml_workspace,
name=e.dataset_name,
description="dataset with training data",
create_new_version=True)

train_step = PythonScriptStep(
name="Train Model",
script_name=e.train_script_path,
Expand All @@ -62,6 +74,7 @@ def main():
"--build_id", build_id_param,
"--model_name", model_name_param,
"--alpha", hyperparameter_alpha_param,
"--dataset_name", dataset_name,
],
runconfig=run_config,
allow_reuse=False,
Expand Down
15 changes: 15 additions & 0 deletions ml_service/util/env_variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ def __init__(self):
self._score_script = os.environ.get("SCORE_SCRIPT")
self._collection_uri = os.environ.get("SYSTEM_COLLECTIONURI")
self._teamproject_name = os.environ.get("SYSTEM_TEAMPROJECT")
self._datastore_name = os.environ.get("DATASTORE_NAME")
self._datafile_name = os.environ.get("DATAFILE_NAME")
self._dataset_name = os.environ.get("DATASET_NAME")

@property
def workspace_name(self):
Expand Down Expand Up @@ -145,3 +148,15 @@ def collection_uri(self):
@property
def teamproject_name(self):
    """Return the stored team project name.

    The value is whatever the SYSTEM_TEAMPROJECT environment variable
    held when the instance was initialised; it is None when that
    variable was not set.
    """
    project = self._teamproject_name
    return project

@property
def datastore_name(self):
    """Return the stored datastore name.

    The value is whatever the DATASTORE_NAME environment variable held
    when the instance was initialised; it is None when that variable
    was not set.
    """
    store = self._datastore_name
    return store

@property
def datafile_name(self):
    """Return the stored data file name.

    The value is whatever the DATAFILE_NAME environment variable held
    when the instance was initialised; it is None when that variable
    was not set.
    """
    filename = self._datafile_name
    return filename

@property
def dataset_name(self):
    """Return the stored dataset name.

    The value is whatever the DATASET_NAME environment variable held
    when the instance was initialised; it is None when that variable
    was not set.
    """
    registered = self._dataset_name
    return registered