Skip to content
This repository has been archived by the owner on Oct 23, 2022. It is now read-only.

AML cloud support #71

Merged
merged 11 commits into from
Apr 7, 2021
11 changes: 11 additions & 0 deletions .amlignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
docs/
test/
.git/
.github/
.vscode/
*.md
LICENSE
Manifest.toml
.gitignore
src/Cloud/amlconf
outputs/
2 changes: 1 addition & 1 deletion .github/workflows/Documentation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
- uses: actions/checkout@v2
- uses: julia-actions/setup-julia@latest
with:
version: 1.5
version: 1.6
- name: Install dependencies
run: |
xvfb-run julia --project=docs/ -e '
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,6 @@ docs/site/
# committed for packages, but should be committed for applications that require a static
# environment.
Manifest.toml

# Cloud config file
src/Cloud/amlconf
3 changes: 3 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
ColorSchemes = "35d6a980-a343-548e-a6ea-1d62b119f2f4"
ColorTypes = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
Conda = "8f4d0f93-b110-5947-807f-2305c1781a2d"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
Expand All @@ -30,6 +31,8 @@ PackageCompiler = "9b87118b-4619-50d2-8e1e-99f35a4d4d9d"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
Polynomials = "f27b6e38-b328-58d1-80ce-0feddd5e7a45"
PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
Expand Down
5 changes: 2 additions & 3 deletions deps/build.jl
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ const GLASSCAT_DIR = joinpath(@__DIR__, "..", "src", "GlassCat") # contains Glas
const JL_DIR = joinpath(GLASSCAT_DIR, "data") # contains AGFGlasscat.jl, SCHOTT.jl, etc.

const SOURCES_PATH = joinpath(@__DIR__, "sources.txt")
const AGFGLASSCAT_PATH = joinpath(JL_DIR, "AGFGlassCat.jl")
const AGFGLASSCAT_NAME = "AGFGlassCat.jl"

include(joinpath(GLASSCAT_DIR, "GlassTypes.jl"))
include("sources.jl")
Expand All @@ -40,6 +40,5 @@ verify_sources!(sources, AGF_DIR)
verified_source_names = [source[1] for source in sources]

# Use verified sources to generate required .jl files
@info "$(isfile(AGFGLASSCAT_PATH) ? "Re-g" : "G")enerating $AGFGLASSCAT_PATH"
@info "Using sources: $(join(verified_source_names, ", ", " and "))"
generate_jls(verified_source_names, AGFGLASSCAT_PATH, JL_DIR, AGF_DIR)
generate_jls(verified_source_names, AGFGLASSCAT_NAME, JL_DIR, AGF_DIR)
8 changes: 4 additions & 4 deletions deps/generate.jl
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ function generate_jls(

# parse the catalog into a module string and write it to a catalog file (.jl)
id, modstring = catalog_to_modstring(id, catalogname, catalog)
push!(catalogfiles, joinpath(jldir, "$(catalogname).jl"))
open(catalogfiles[end], "w") do io
push!(catalogfiles, "$(catalogname).jl")
open(joinpath(jldir, catalogfiles[end]), "w") do io
write(io, modstring)
end

Expand All @@ -59,13 +59,13 @@ function generate_jls(
agfstrings = [
"export $(join(sourcenames, ", "))",
"",
["include(raw\"$(catalogfile)\")" for catalogfile in catalogfiles]...,
["include(\"$(catalogfile)\")" for catalogfile in catalogfiles]...,
"",
"const AGF_GLASS_NAMES = [$(join(repr.(glassnames), ", "))]",
"const AGF_GLASSES = [$(join(glassnames, ", "))]",
""
]
open(mainfile, "w") do io
open(joinpath(jldir, mainfile), "w") do io
write(io, join(agfstrings, "\n"))
end
end
Expand Down
6 changes: 3 additions & 3 deletions docs/make.jl
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,7 @@ makedocs(
modules = [OpticSim],
pages = [
"Home" => "index.md",
"Examples" => "examples.md",
# "Glasses" => cat_pages,
"Examples" => "examples.md",
"Geometry" => [
"Basic Types" => "basic_types.md",
"Primitives" => "primitives.md",
Expand All @@ -48,6 +47,7 @@ makedocs(
"Visualization" => "vis.md",
"Glass Functions" => "glasscat.md",
"Optimization" => "optimization.md",
"Cloud Execution" => "cloud.md",
"Reference" => "ref.md",
"Roadmap" => "roadmap.md"
],
Expand All @@ -56,12 +56,12 @@ makedocs(
deploydocs(
repo = "github.com/microsoft/OpticSim.jl.git",
devbranch = "main",
push_preview = true,
)

# function children(m::Module)
# ns = names(m, imported = false, all = true)
# ms = []

# for n in ns
# try
# x = Core.eval(m, n)
Expand Down
54 changes: 54 additions & 0 deletions docs/src/cloud.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Cloud Execution

## Azure

A key benefit and design motivation of OpticSim is being able to execute many simulations/optimizations at once.
This is best enabled through the use of cloud computing services such as Azure.

As part of the base package, we provide support for cloud execution using an [Azure Machine Learning](https://azure.microsoft.com/en-gb/free/machine-learning) workspace.

To use this functionality you'll first need to set up an AML workspace with a compute cluster. Then you'll need to provide a few pieces of information to OpticSim:

- Subscription ID, of the form `XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX`
- Resource group name
- Workspace name
- Compute cluster name

This information can be cached either to a specific file, or globally:

```@docs
OpticSim.Cloud.cache_run_config
OpticSim.Cloud.get_cached_run_config
```

You should also include an `.amlignore` file in the root of your project.
This is similar to a `.gitignore` file and should list any files which should not be uploaded to AML as part of your source snapshot, for example `test/`.

!!! note
    **`Manifest.toml` must be listed in your `.amlignore` file.**

If an `.amlignore` doesn't already exist then one will be created on the first submission of a run to AML.

Once everything is configured, you can submit a run:

```@docs
OpticSim.Cloud.submit_run_to_AML
```

To retrieve outputs from your run simply write files to the `outputs/` directory and the files will automatically appear as part of the AML run.

### Examples

```julia
using OpticSim.Cloud

cache_run_config([subscription_id], [resource_group_name], [workspace_name], [compute_name], [path_to_config])

submit_run_to_AML("example-run", [path_to_script], ["--arg1", "1", "--arg2", "2"], nothing, config_path=[path_to_config])

submit_run_to_AML("example-hyperdrive-run", [path_to_script], ["--arg1", "1"], Dict("--arg2" => ["1", "2", "3"]), config_path=[path_to_config])
```

## Other Cloud Services

Currently no other services are supported, though it should be reasonably straightforward to add similar functionality to that for AML.
179 changes: 179 additions & 0 deletions src/Cloud/Cloud.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
module Cloud

using PyCall
using Conda
using Pkg
using Random

"""
    cache_run_config(subscription_id::String, resource_group::String, workspace_name::String, compute_name::String[, path::String])

Store the AML connection settings in the file at `path`, one value per line in the order
subscription ID, resource group, workspace name, compute name (no trailing newline).
When `path` is omitted the settings are written to the `amlconf` file next to this source
file, which makes them the global default for this OpticSim install.

Returns `nothing`.
"""
function cache_run_config(subscription_id::String, resource_group::String, workspace_name::String, compute_name::String, path::String = joinpath(@__DIR__, "amlconf"))
    # Assemble the whole payload first so the file is written in a single call.
    contents = join((subscription_id, resource_group, workspace_name, compute_name), "\n")
    open(path, "w") do io
        write(io, contents)
    end
    return nothing
end

"""
    get_cached_run_config([path::String])

Load the AML connection settings stored at `path` and return them as the tuple
`(subscription_id, resource_group, workspace_name, compute_name)`. The first four lines of
the file are read in that order. When `path` is omitted the global `amlconf` file next to
this source file is used.
"""
function get_cached_run_config(path::String = joinpath(@__DIR__, "amlconf"))
    return open(path, "r") do io
        # `readline` yields "" once the stream is exhausted, so short files give empty fields.
        fields = [readline(io) for _ in 1:4]
        (fields[1], fields[2], fields[3], fields[4])
    end
end

"""
    submit_run_to_AML(run_name::String, path_to_script::String, script_args::Union{Nothing,Vector{String}} = nothing, sampled_args::Union{Nothing,Dict{String,Vector{String}}} = nothing; config_path::String, hyperdrive_concurrent_runs::Int = 10)
    submit_run_to_AML(run_name::String, path_to_script::String, subscription_id::String, resource_group::String, workspace_name::String, compute_name::String, script_args::Union{Nothing,Vector{String}} = nothing, sampled_args::Union{Nothing,Dict{String,Vector{String}}} = nothing; hyperdrive_concurrent_runs::Int = 10)

Submit a run to AML.

# Arguments
- `run_name`: value attached to the submitted run as its `run_name` tag.
- `path_to_script`: path to the Julia script to execute, relative to your local package root
  (i.e. the location of `Project.toml`).
- `script_args`: a series of arguments to your script, as strings, or `nothing`.
- `sampled_args`: a dictionary where keys are argument names and values are lists of values
  (as strings) that the argument will take, or `nothing`. When given, a grid-sampled
  hyperdrive run covering every combination is submitted.
- `subscription_id`, `resource_group`, `workspace_name`, `compute_name`: the AML connection
  settings, supplied directly (second method).

# Keywords
- `config_path`: path to a config file as written by [`cache_run_config`](@ref) (first
  method only). If not specified the global config is used.
- `hyperdrive_concurrent_runs`: the maximum number of concurrent runs that will execute on
  AML (limited by your compute cluster size).
"""
function submit_run_to_AML(run_name::String, path_to_script::String, script_args::Union{Nothing,Vector{String}} = nothing,
    sampled_args::Union{Nothing,Dict{String,Vector{String}}} = nothing;
    config_path::String = joinpath(@__DIR__, "amlconf"),
    hyperdrive_concurrent_runs::Int = 10)
    # Resolve the connection settings from the cached config, then defer to the explicit method.
    subscription_id, resource_group, workspace_name, compute_name = get_cached_run_config(config_path)
    return submit_run_to_AML(run_name, path_to_script, subscription_id, resource_group, workspace_name, compute_name,
        script_args, sampled_args, hyperdrive_concurrent_runs = hyperdrive_concurrent_runs)
end

# Build the Dockerfile `RUN` instruction that installs every package listed under `deps` in
# the parsed Project.toml `project_dict`. Packages with a `compat` entry are pinned to that
# version string. `OpticSim` is additionally built when it is one of the dependencies.
function _pkg_install_cmd(project_dict::AbstractDict)
    deps = get(project_dict, "deps", Dict{String,Any}())
    compat = get(project_dict, "compat", Dict{String,Any}())
    packages = sort!(collect(keys(deps)))

    io = IOBuffer()
    print(io, "RUN julia -e \"using Pkg; ")
    for package_name in packages
        # The inner quotes are backslash-escaped because the whole command sits inside a
        # double-quoted shell string in the Dockerfile.
        if haskey(compat, package_name)
            print(io, "Pkg.add(name=\\\"", package_name, "\\\", version=\\\"", compat[package_name], "\\\");")
        else
            print(io, "Pkg.add(\\\"", package_name, "\\\");")
        end
    end
    if "OpticSim" in packages
        # build OpticSim if it is there
        print(io, "Pkg.build(\\\"OpticSim\\\");")
    end
    print(io, "\"")
    return String(take!(io))
end

# Submit a run to AML using explicitly supplied connection settings.
#
# The active project's directory is uploaded as the source snapshot, the bundled dockerfile is
# extended with a step installing the project's dependencies, and a temporary copy of
# `entry_script.py` is placed in the project directory for the duration of the submission.
# Throws `ArgumentError` when there is no active project.
function submit_run_to_AML(run_name::String, path_to_script::String,
    subscription_id::String, resource_group::String, workspace_name::String, compute_name::String,
    script_args::Union{Nothing,Vector{String}} = nothing,
    sampled_args::Union{Nothing,Dict{String,Vector{String}}} = nothing;
    hyperdrive_concurrent_runs::Int = 10)

    project_file = Base.active_project()
    if project_file === nothing
        throw(ArgumentError("no active project found; activate the project you want to submit"))
    end
    source_directory = dirname(project_file)

    dockerfile = read(joinpath(@__DIR__, "dockerfile"), String)
    # Make sure the appended instruction starts on its own line even if the bundled
    # dockerfile has no trailing newline.
    if !isempty(dockerfile) && !endswith(dockerfile, "\n")
        dockerfile *= "\n"
    end

    # add deps to dockerfile
    dockerfile *= _pkg_install_cmd(Pkg.TOML.parsefile(project_file))

    # TODO maybe compile sysimage in docker - would be horribly slow but should speed up import a lot?

    amlignore_path = joinpath(source_directory, ".amlignore")
    if isfile(joinpath(source_directory, "Manifest.toml")) && !isfile(amlignore_path)
        @info "No .amlignore file found, creating one"
        open(amlignore_path, "w") do io
            write(io, "Manifest.toml\n")
        end
    end

    # set up env for python stuff
    try
        pyimport("azureml.core")
    catch
        # FIXME maybe won't work, might need to restart Julia after this?
        Conda.add("python=3.7")
        Conda.add("pip=20.1.1")
        Conda.pip_interop(true)
        Conda.pip("install", "azureml-sdk")
        Pkg.build("PyCall")
    end

    # Define the Python helpers before touching the source directory, so an import failure
    # here leaves no temporary file behind.
    py"""
    import os
    import webbrowser
    from azureml.core import Environment, Experiment, Run, Workspace, ScriptRunConfig
    import azureml.train.hyperdrive as hyperdrive
    from azureml.train.hyperdrive.parameter_expressions import choice

    def get_hyperparam_dict(param_dict):
        hyper_param_dict = {}
        num_params = 1
        for key, value in param_dict.items():
            hyper_param_dict[key] = choice(value)
            num_params = num_params * len(value)

        return hyper_param_dict, num_params

    def submit_run(subscription_id, resource_group, workspace_name, compute_name, source_directory, julia_script,
                   script_args, sampled_args, run_name, dockerfile, entry_script_path, hyperdrive_concurrent_runs):
        workspace = Workspace(subscription_id, resource_group, workspace_name)

        compute_target = workspace.compute_targets[compute_name]

        env = Environment("opticsim")
        env.docker.base_image = None
        env.docker.base_dockerfile = dockerfile

        args = [julia_script.replace(os.sep, "/")]
        if script_args is not None:
            args.extend(script_args)

        src = ScriptRunConfig(source_directory=source_directory,
                              script=entry_script_path,
                              arguments=args,
                              compute_target=compute_target,
                              environment=env)
        src.run_config.docker.use_docker = True

        exp_name = os.getlogin() + "-opticsim"
        experiment = Experiment(workspace, exp_name)

        if sampled_args is not None:
            sampling_params, num_params = get_hyperparam_dict(sampled_args)
            param_sampling = hyperdrive.GridParameterSampling(sampling_params)
            hyperdrive_run_config = hyperdrive.HyperDriveConfig(run_config=src,
                                                                hyperparameter_sampling=param_sampling,
                                                                max_concurrent_runs=hyperdrive_concurrent_runs,
                                                                primary_metric_name="",
                                                                primary_metric_goal=hyperdrive.PrimaryMetricGoal.MINIMIZE,
                                                                max_total_runs=num_params)
            run_object = experiment.submit(hyperdrive_run_config, tags={"run_name": run_name})
        else:
            run_object = experiment.submit(src, tags={"run_name": run_name})

        webbrowser.open_new(run_object.get_portal_url())
    """

    # copy entry_script from here to source_directory; the random suffix avoids clashing
    # with an existing file or a concurrent submission
    entry_script_path = "entry_script_" * randstring() * ".py"
    entry_script_copy = joinpath(source_directory, entry_script_path)
    cp(joinpath(@__DIR__, "entry_script.py"), entry_script_copy)

    try
        py"submit_run"(subscription_id, resource_group, workspace_name, compute_name, source_directory, path_to_script,
            script_args, sampled_args, run_name, dockerfile, entry_script_path, hyperdrive_concurrent_runs)
    finally
        # remove the entry script, even when the submission failed
        rm(entry_script_copy, force = true)
    end
end

export submit_run_to_AML, cache_run_config, get_cached_run_config

end
15 changes: 15 additions & 0 deletions src/Cloud/dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
FROM mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04

# Julia release to install. Note that JULIA_VERSION_MAJOR holds the "major.minor" series
# (it is also the directory name on the download server) and JULIA_VERSION_MINOR holds the
# final component of the version number.
ENV JULIA_VERSION_MAJOR 1.6
ENV JULIA_VERSION_MINOR 0

# install stuff for xvfb-run
# (refresh the package index first so the install does not depend on a stale image cache)
RUN apt-get update && \
    apt-get install -y libxt6 libxrender1 libxext6 libgl1-mesa-glx libqt5widgets5 xvfb && \
    rm -rf /var/lib/apt/lists/*

# install julia
ENV JULIA_VERSION $JULIA_VERSION_MAJOR.$JULIA_VERSION_MINOR

# download, unpack into /opt and expose `julia` on the PATH in a single layer so the
# tarball is not kept in the image
RUN wget https://julialang-s3.julialang.org/bin/linux/x64/$JULIA_VERSION_MAJOR/julia-$JULIA_VERSION-linux-x86_64.tar.gz -O /tmp/julia-$JULIA_VERSION-linux-x86_64.tar.gz && \
    tar -xzf /tmp/julia-$JULIA_VERSION-linux-x86_64.tar.gz -C /opt && \
    rm /tmp/julia-$JULIA_VERSION-linux-x86_64.tar.gz && \
    ln -s /opt/julia-$JULIA_VERSION/bin/julia /usr/local/bin/julia
12 changes: 12 additions & 0 deletions src/Cloud/entry_script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import multiprocessing
import os
import subprocess
import sys


def main():
    """Resolve the Julia project, then run the requested Julia script under xvfb.

    The command-line arguments of this script (Julia script path followed by its
    arguments) are forwarded to ``julia --project`` unchanged. Returns the exit code
    of the first step that fails, or of the Julia script itself.
    """
    julia = ["xvfb-run", "julia", "--project"]

    # set up the project (should already be installed in base through docker so should be quick)
    setup = subprocess.run(julia + ["-e", "using Pkg; Pkg.resolve()"])
    if setup.returncode != 0:
        return setup.returncode

    # run the script; arguments are passed as a list so values containing spaces or
    # shell metacharacters reach Julia exactly as given
    env = dict(os.environ, JULIA_NUM_THREADS=str(multiprocessing.cpu_count()))
    run = subprocess.run(julia + sys.argv[1:], env=env)
    return run.returncode


if __name__ == "__main__":
    sys.exit(main())
1 change: 1 addition & 0 deletions src/OpticSim.jl
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ include("Optical/Emitters.jl") # defines the Emitters module

include("Examples/Examples.jl")
include("Optimization/Optimization.jl")
include("Cloud/Cloud.jl")

#initialize these caches here so they will get the correct number of threads from the load time environment, rather than the precompile environment. The latter happens if the initialization happens in the const definition. If the precompile and load environments have different numbers of threads this will cause an error.
function __init__()
Expand Down
Loading