Copyright (c) Microsoft Corporation. All rights reserved. 

Licensed under the MIT License.

![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/azure-arcadia/Synapse_Job_Scala_Support.png)

## Get the AML workspace that has a Synapse Spark pool attached

In [None]:
from azureml.core import (
    Workspace,
    Experiment,
    Dataset,
    Environment,
)

# Obtain the workspace handle via Workspace.from_config()
ws = Workspace.from_config()

# Echo the identifying workspace attributes, one per line
for workspace_detail in (ws.name, ws.resource_group, ws.location, ws.subscription_id):
    print(workspace_detail)

## Leverage ScriptRunConfig to submit a Scala job to an attached Synapse Spark cluster

### Prepare data

In [None]:
from azureml.core.datastore import Datastore

# Use the default blob storage, looked up by its registered name
def_blob_store = Datastore(ws, "workspaceblobstore")

# Upload a sample file from the local directory to be used as the data source.
# overwrite=False is passed, so an overwrite of existing files is not requested.
file_name = "shakespeare.txt"
local_paths = ["./" + file_name]
def_blob_store.upload_files(files=local_paths, overwrite=False)

# Create a file dataset that references the uploaded file on the datastore
datastore_paths = [(def_blob_store, file_name)]
file_dataset = Dataset.File.from_files(path=datastore_paths)

In [None]:
import uuid

from azureml.core import ScriptRunConfig
from azureml.core.runconfig import RunConfiguration
from azureml.data import HDFSOutputDatasetConfig

# Run configuration targeting the attached Synapse Spark pool named "link-pool"
run_config = RunConfiguration(framework="pyspark")
run_config.target = "link-pool"

# Spark settings for the job, applied to the run configuration in the order listed
spark_settings = {
    "spark.driver.memory": "2g",
    "spark.driver.cores": 2,
    "spark.executor.memory": "2g",
    "spark.executor.cores": 1,
    "spark.executor.instances": 1,
    # This can be removed if you are using local jars in source folder
    "spark.yarn.dist.jars": "wasbs://synapse@azuremlexamples.blob.core.windows.net/shared/wordcount.jar",
}
for setting_name, setting_value in spark_settings.items():
    run_config.spark.configuration[setting_name] = setting_value

# A fresh UUID keeps each run's output directory distinct
dir_name = "wordcount-" + str(uuid.uuid4())

# NOTE(review): `input` shadows the Python builtin for the rest of the notebook
# session; the name is kept here in case other cells reference it.
named_input = file_dataset.as_named_input("input")
input = named_input.as_hdfs()

# Job output goes to "<dir_name>/result" on the workspace's default datastore
output_destination = (ws.get_default_datastore(), dir_name + "/result")
output = HDFSOutputDatasetConfig(destination=output_destination)

# Arguments handed to start_script.py; `args` is reused by the pipeline step cell
args = ['--input', input, '--output', output]
script_run_config = ScriptRunConfig(
    source_directory='.',
    script='start_script.py',
    arguments=args,
    run_config=run_config,
)


In [None]:
from azureml.core import Experiment

# Submit the ScriptRunConfig under the 'synapse-spark' experiment
experiment_name = 'synapse-spark'
exp = Experiment(name=experiment_name, workspace=ws)
run = exp.submit(config=script_run_config)

# Bare expression as the last statement so the notebook displays the run
run

## Leverage SynapseSparkStep in an AML pipeline to add a dataprep step on a Synapse Spark cluster

In [None]:
from azureml.pipeline.core import Pipeline
from azureml.pipeline.steps import SynapseSparkStep

# Extra Spark configuration for the step. It is left empty here because the jar
# is passed through the `jars` argument; the commented line below is the
# original's alternative of passing it through the configuration instead.
configs = {}
#configs["spark.yarn.dist.jars"] = "wasbs://synapse@azuremlexamples.blob.core.windows.net/shared/wordcount.jar"

# Keyword arguments for the step, collected in one place and unpacked below.
# `arguments` reuses the `args` list built for the ScriptRunConfig submission.
step_settings = dict(
    name='synapse-spark',
    file='start_script.py',
    jars="wasbs://synapse@azuremlexamples.blob.core.windows.net/shared/wordcount.jar",
    source_directory=".",
    arguments=args,
    compute_target='link-pool',
    driver_memory="2g",
    driver_cores=2,
    executor_memory="2g",
    executor_cores=1,
    num_executors=1,
    conf=configs,
)
step_1 = SynapseSparkStep(**step_settings)

In [None]:
# Build a pipeline containing the single Synapse Spark step and submit it
# under the 'synapse-pipeline' experiment name, passing regenerate_outputs=True
pipeline_steps = [step_1]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
pipeline_run = pipeline.submit('synapse-pipeline', regenerate_outputs=True)