diff --git a/cmd/project/defaults.go b/cmd/project/defaults.go
new file mode 100644
index 0000000..ccaa351
--- /dev/null
+++ b/cmd/project/defaults.go
@@ -0,0 +1,15 @@
+package project
+
+// getDefaultModelName returns the default model name for the given example model type
+func getDefaultModelName(modelType string) string {
+	switch modelType {
+	case "LLM":
+		return "google/flan-t5-base"
+	case "Stable Diffusion":
+		return "stabilityai/sdxl-turbo"
+	case "Text to Audio":
+		return "facebook/musicgen-small"
+	}
+
+	return ""
+}
diff --git a/cmd/project/example.toml b/cmd/project/example.toml
deleted file mode 100644
index c0e120e..0000000
--- a/cmd/project/example.toml
+++ /dev/null
@@ -1,32 +0,0 @@
-# RunPod Project Configuration
-
-name = "placeholder"
-
-[project]
-uuid = "placeholder"
-base_image = "runpod/base:0.4.4-cuda11.8.0"
-gpu_types = [
-    "NVIDIA RTX A4000",
-    "NVIDIA RTX A4500",
-    "NVIDIA RTX A5000",
-    "NVIDIA GeForce RTX 3090",
-    "NVIDIA RTX A6000",
-]
-gpu_count = 1
-volume_mount_path = "/runpod-volume"
-ports = "8080/http, 22/tcp, 4040/http"
-container_disk_size_gb = 10
-
-[project.env_vars]
-POD_INACTIVITY_TIMEOUT = "120"
-RUNPOD_DEBUG_LEVEL = "debug"
-UVICORN_LOG_LEVEL = "warning"
-
-[template]
-model_type = "default"
-model_name = "None"
-
-[runtime]
-python_version = "3.10"
-handler_path = "src/handler.py"
-requirements_path = "builder/requirements.txt"
diff --git a/cmd/project/functions.go b/cmd/project/functions.go
index 84e3b19..57a547d 100644
--- a/cmd/project/functions.go
+++ b/cmd/project/functions.go
@@ -6,13 +6,13 @@ import (
 	"errors"
 	"fmt"
 	"io/fs"
+	"log"
 	"os"
 	"path"
 	"path/filepath"
 	"strings"
 	"time"
 
-	"github.com/google/uuid"
 	"github.com/pelletier/go-toml"
 )
 
@@ -21,9 +21,6 @@ import (
 //go:embed starter_examples/* starter_examples/*/.*
 var starterTemplates embed.FS
 
-//go:embed example.toml
-var tomlTemplate embed.FS
-
 //go:embed exampleDockerfile
 var dockerfileTemplate embed.FS
 
@@ -67,24 +64,41 @@ func copyFiles(files fs.FS, source string, dest string) error {
 	})
 }
 
-func createNewProject(projectName string, cudaVersion string,
-	pythonVersion string, modelType string, modelName string, initCurrentDir bool) {
-	projectFolder, _ := os.Getwd()
+func createNewProject(projectName string, cudaVersion string, pythonVersion string, modelType string, modelName string, initCurrentDir bool) {
+	projectFolder, err := os.Getwd()
+	if err != nil {
+		log.Fatalf("Failed to get current working directory: %v", err)
+	}
+
 	if !initCurrentDir {
 		projectFolder = filepath.Join(projectFolder, projectName)
-		_, err := os.Stat(projectFolder)
-		if os.IsNotExist(err) {
-			os.Mkdir(projectFolder, 0755)
+
+		if _, err := os.Stat(projectFolder); os.IsNotExist(err) {
+			if err := os.Mkdir(projectFolder, 0755); err != nil {
+				log.Fatalf("Failed to create project directory: %v", err)
+			}
 		}
+
 		if modelType == "" {
 			modelType = "default"
 		}
-		templatePath := fmt.Sprintf("%s/%s", basePath, modelType)
-		//load selected starter template
-		err = copyFiles(starterTemplates, templatePath, projectFolder)
-		if err != nil {
-			panic(err)
+
+		if modelName == "" {
+			modelName = getDefaultModelName(modelType)
 		}
+
+		examplePath := fmt.Sprintf("%s/%s", basePath, modelType)
+		if err := copyFiles(starterTemplates, examplePath, projectFolder); err != nil {
+			log.Fatalf("Failed to copy starter example: %v", err)
+		}
+
+		// Swap out the model name in handler.py
+		handlerPath := fmt.Sprintf("%s/src/handler.py", projectFolder)
+		handlerContentBytes, _ := os.ReadFile(handlerPath)
+		handlerContent := string(handlerContentBytes)
+		handlerContent = strings.ReplaceAll(handlerContent, "<>", modelName)
+		os.WriteFile(handlerPath, []byte(handlerContent), 0644)
+
 		requirementsPath := fmt.Sprintf("%s/builder/requirements.txt", projectFolder)
 		requirementsContentBytes, _ := os.ReadFile(requirementsPath)
 		requirementsContent := string(requirementsContentBytes)
@@ -93,19 +108,8 @@ func createNewProject(projectName string, cudaVersion string,
 		requirementsContent = strings.ReplaceAll(requirementsContent, "<>", "runpod")
 		os.WriteFile(requirementsPath, []byte(requirementsContent), 0644)
 	}
-	//generate project toml
-	tomlBytes, _ := tomlTemplate.ReadFile("example.toml")
-	projectToml, _ := toml.LoadBytes(tomlBytes)
-	projectUuid := uuid.New().String()[0:8]
-	projectToml.SetComment("RunPod Project Configuration") //TODO why does this not appear
-	projectToml.SetPath([]string{"name"}, projectName)
-	projectToml.SetPath([]string{"project", "uuid"}, projectUuid)
-	projectToml.SetPath([]string{"project", "base_image"}, baseDockerImage(cudaVersion))
-	// projectToml.SetPath([]string{"template", "model_type"}, modelType)
-	// projectToml.SetPath([]string{"template", "model_name"}, modelName)
-	projectToml.SetPath([]string{"runtime", "python_version"}, pythonVersion)
-	tomlPath := filepath.Join(projectFolder, "runpod.toml")
-	os.WriteFile(tomlPath, []byte(projectToml.String()), 0644)
+
+	generateProjectToml(projectFolder, "runpod.toml", projectName, cudaVersion, pythonVersion)
 }
 
 func loadProjectConfig() *toml.Tree {
@@ -241,8 +245,19 @@ func startProject(networkVolumeId string) error {
 	//parse project toml
 	config := loadProjectConfig()
 	fmt.Println(config)
-	projectId := config.GetPath([]string{"project", "uuid"}).(string)
-	projectName := config.GetPath([]string{"name"}).(string)
+
+	// Project ID
+	projectId, ok := config.GetPath([]string{"project", "uuid"}).(string)
+	if !ok {
+		return fmt.Errorf("project ID not found in config")
+	}
+
+	// Project Name
+	projectName, ok := config.GetPath([]string{"name"}).(string)
+	if !ok {
+		return fmt.Errorf("project name not found in config")
+	}
+
 	//check for existing pod
 	projectPodId, err := getProjectPod(projectId)
 	if projectPodId == "" || err != nil {
@@ -252,12 +267,14 @@ func startProject(networkVolumeId string) error {
 			return err
 		}
 	}
+
 	//open ssh connection
 	sshConn, err := PodSSHConnection(projectPodId)
 	if err != nil {
 		fmt.Println("error establishing SSH connection to Pod: ", err)
 		return err
 	}
+
 	fmt.Println(fmt.Sprintf("Project %s Pod (%s) created.", projectName, projectPodId))
 	//create remote folder structure
 	projectConfig := config.Get("project").(*toml.Tree)
@@ -295,107 +312,152 @@ func startProject(networkVolumeId string) error {
 		python -m pip install -v --requirement %s --report /installreport.json`, venvPath, remoteProjectPath, config.GetPath([]string{"runtime", "requirements_path"}).(string)),
 	})
+
 	//create file watcher
 	fmt.Println("Creating Project watcher...")
 	go sshConn.SyncDir(cwd, projectPathUuidDev)
+
 	//run launch api server / hot reload loop
 	pipReqPath := path.Join(remoteProjectPath, config.GetPath([]string{"runtime", "requirements_path"}).(string))
 	handlerPath := path.Join(remoteProjectPath, config.GetPath([]string{"runtime", "handler_path"}).(string))
 	launchApiServer := fmt.Sprintf(`
-	pkill inotify
+	#!/bin/bash
+	API_PORT=8080
+	API_HOST="0.0.0.0"
+	PYTHON_VENV_PATH="%s" # Path to the Python virtual environment used during development located on the Pod at //venv
+	PROJECT_DIRECTORY="%s/%s"
+	VENV_ARCHIVE_PATH="%s"
+	HANDLER_PATH="%s"
+	REQUIRED_FILES="%s"
+
+	pkill inotify # Kill any existing inotify processes
+
+	function start_api_server {
+		lsof -ti:$API_PORT | xargs kill -9 2>/dev/null # Kill the old API server if it's still running
+		python $1 --rp_serve_api --rp_api_host="$API_HOST" --rp_api_port=$API_PORT --rp_api_concurrency=1 &
+		SERVER_PID=$!
+	}
+
+	function force_kill {
+		if [[ -z "$1" ]]; then
+			echo "No PID provided for force_kill."
+			return
+		fi
+
+		kill $1 2>/dev/null
-	function force_kill {
-		kill $1 2>/dev/null
-		sleep 1
+		for i in {1..5}; do # Wait up to 5 seconds, checking every second.
+			if ! ps -p $1 > /dev/null 2>&1; then
+				echo "Process $1 has been gracefully terminated."
+				return
+			fi
+			sleep 1
+		done
-		if ps -p $1 > /dev/null; then
 		echo "Graceful kill failed, attempting SIGKILL..."
 		kill -9 $1 2>/dev/null
-			sleep 1
-			if ps -p $1 > /dev/null; then
-				echo "Failed to kill process with PID: $1"
-				exit 1
-			else
-				echo "Killed process with PID: $1 using SIGKILL"
-			fi
+		for i in {1..5}; do # Wait up to 5 seconds, checking every second.
+			if ! ps -p $1 >/dev/null 2>&1; then
+				echo "Process $1 has been killed with SIGKILL."
+				return
+			fi
+			sleep 1
+		done
+
+		echo "Failed to kill process with PID: $1 after SIGKILL attempt."
+		exit 1
+	}
+
+	function cleanup {
+		echo "Cleaning up..."
+		force_kill $SERVER_PID
+	}
+	trap cleanup EXIT SIGINT
+	if source $PYTHON_VENV_PATH/bin/activate; then
+		echo -e "- Activated project environment."
 	else
-		echo "Killed process with PID: $1"
+		echo "Failed to activate project environment."
+		exit 1
 	fi
-	}
-	function cleanup {
-		echo "Cleaning up..."
-		force_kill $last_pid
-	}
-
-	trap cleanup EXIT SIGINT
+	if cd $PROJECT_DIRECTORY; then
+		echo -e "- Changed to project directory."
+	else
+		echo "Failed to change directory."
+		exit 1
+	fi
-	if source %s/bin/activate; then
-		echo -e "- Activated project environment."
-	else
-		echo "Failed to activate project environment."
-		exit 1
-	fi
+	function tar_venv {
+		if ! [ $(cat /installreport.json | grep "install" | grep -c "\[\]") -eq 1 ]
+		then
+			tar -c -C $PYTHON_VENV_PATH . | zstd -T0 > /venv.tar.zst;
+			mv /venv.tar.zst $VENV_ARCHIVE_PATH ;
+			echo "Synced venv to network volume"
+		fi
+	}
-	if cd %s/%s; then
-		echo -e "- Changed to project directory."
-	else
-		echo "Failed to change directory."
-		exit 1
-	fi
+	tar_venv &
-	function tar_venv {
-		if ! [ $(cat /installreport.json | grep "install" | grep -c "\[\]") -eq 1 ]
-		then
-			tar -c -C %s . | zstd -T0 > /venv.tar.zst;
-			mv /venv.tar.zst %s;
-			echo "synced venv to network volume"
-		fi
-	}
+	# Start the API server in the background, and save the PID
+	start_api_server $HANDLER_PATH
-	# Start the API server in the background, and save the PID
-	tar_venv &
-	python %s --rp_serve_api --rp_api_host="0.0.0.0" --rp_api_port=8080 --rp_api_concurrency=1 &
-	last_pid=$!
-
-	echo -e "- Started API server with PID: $last_pid" && echo ""
-	echo "Connect to the API server at:"
-	echo "> https://$RUNPOD_POD_ID-8080.proxy.runpod.net" && echo ""
-
-	#like inotifywait, but will only report the name of a file if it shouldn't be ignored according to .runpodignore
-	#uses git check-ignore to ensure same syntax as gitignore, but git check-ignore expects to be run in a repo
-	#so we must set up a git-repo-like file structure in some temp directory
-	function notify_nonignored_file {
-		tmp_dir=$(mktemp -d)
-		cp .runpodignore $tmp_dir/.gitignore && cd $tmp_dir && git init -q #setup fake git in temp dir
-		echo $(inotifywait -q -r -e modify,create,delete %s --format '%%w%%f' | xargs -I _ sh -c 'realpath --relative-to="%s" "_" | git check-ignore -nv --stdin | grep :: | tr -d :[":blank:"]')
-		rm -rf $tmp_dir
-	}
+	echo -e "- Started API server with PID: $SERVER_PID" && echo ""
+	echo "Connect to the API server at:"
+	echo "> https://$RUNPOD_POD_ID-8080.proxy.runpod.net" && echo ""
-	while true; do
-		if changed_file=$(notify_nonignored_file); then
-			echo "Found changes in: $changed_file"
-		else
-			echo "No changes found."
-			exit 1
-		fi
+	#like inotifywait, but will only report the name of a file if it shouldn't be ignored according to .runpodignore
+	#uses git check-ignore to ensure same syntax as gitignore, but git check-ignore expects to be run in a repo
+	#so we must set up a git-repo-like file structure in some temp directory
+	function notify_nonignored_file {
+		local tmp_dir=$(mktemp -d)
+		cp .runpodignore "$tmp_dir/.gitignore"
+		cd "$tmp_dir" && git init -q # Setup a temporary git repo to leverage .gitignore
-		force_kill $last_pid
+		local project_directory="$PROJECT_DIRECTORY"
-		if [[ $changed_file == *"requirements"* ]]; then
-			echo "Installing new requirements..."
-			python -m pip install --upgrade pip && python -m pip install -r %s --report /installreport.json
-			tar_venv &
-		fi
+		# Listen for file changes.
+		inotifywait -q -r -e modify,create,delete --format '%%w%%f' "$project_directory" | while read -r file; do
+			# Convert each file path to a relative path and check if it's ignored by git
+			local rel_path=$(realpath --relative-to="$project_directory" "$file")
+			if ! git check-ignore -q "$rel_path"; then
+				echo "$rel_path"
+			fi
+		done
-		python %s --rp_serve_api --rp_api_host="0.0.0.0" --rp_api_port=8080 --rp_api_concurrency=1 &
-		last_pid=$!
+		cd - > /dev/null # Return to the original directory
+		rm -rf "$tmp_dir"
+	}
+	trap '[[ -n $tmp_dir && -d $tmp_dir ]] && rm -rf "$tmp_dir"' EXIT
+
+	monitor_and_restart() {
+		while true; do
+			if changed_file=$(notify_nonignored_file); then
+				echo "Found changes in: $changed_file"
+			else
+				echo "No changes found."
+				exit 1
+			fi
+
+			force_kill $SERVER_PID
+
+			# Install new requirements if requirements.txt was changed
+			if [[ $changed_file == *"requirements"* ]]; then
+				echo "Installing new requirements..."
+				python -m pip install --upgrade pip && python -m pip install -r $REQUIRED_FILES --report /installreport.json
+				tar_venv &
+			fi
+
+			# Restart the API server in the background, and save the PID
+			start_api_server $HANDLER_PATH
+
+			echo "Restarted API server with PID: $SERVER_PID"
+		done
+	}
-		echo "Restarted API server with PID: $last_pid"
-	done
-	`, venvPath, projectPathUuidDev, projectName, venvPath, archivedVenvPath, handlerPath, remoteProjectPath, remoteProjectPath, pipReqPath, handlerPath)
+
+	monitor_and_restart
+	`, venvPath, projectPathUuidDev, projectName, archivedVenvPath, handlerPath, pipReqPath)
 	fmt.Println()
 	fmt.Println("Starting project endpoint...")
 	sshConn.RunCommand(launchApiServer)
diff --git a/cmd/project/tomlBuilder.go b/cmd/project/tomlBuilder.go
new file mode 100644
index 0000000..df24b38
--- /dev/null
+++ b/cmd/project/tomlBuilder.go
@@ -0,0 +1,96 @@
+package project
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+
+	"github.com/google/uuid"
+)
+
+func generateProjectToml(projectFolder, filename, projectName, cudaVersion, pythonVersion string) {
+	template := `# RunPod Project Configuration
+
+
+name = "%s"
+
+
+[project]
+uuid = "%s" # Unique identifier for the project. Generated automatically.
+
+# Base Docker image used for the project environment. Includes essential packages and CUDA support.
+# Use 'runpod/base' as a starting point. Customize only if you need additional packages or configurations.
+base_image = "runpod/base:0.5.0-cuda%s"
+
+# List of preferred GPU types for your development pod, ordered by priority.
+# The pod will use the first available type from this list.
+# For a full list of supported GPU types, visit: https://docs.runpod.io/references/gpu-types
+gpu_types = [
+    "NVIDIA GeForce RTX 4080",  # 16GB
+    "NVIDIA RTX A4000",         # 16GB
+    "NVIDIA RTX A4500",         # 20GB
+    "NVIDIA RTX A5000",         # 24GB
+    "NVIDIA GeForce RTX 3090",  # 24GB
+    "NVIDIA GeForce RTX 4090",  # 24GB
+    "NVIDIA RTX A6000",         # 48GB
+    "NVIDIA A100 80GB PCIe",    # 80GB
+]
+
+gpu_count = 1
+
+# Default volume mount path in serverless environment. Changing this may affect data persistence.
+volume_mount_path = "/runpod-volume"
+
+# Ports to expose and their protocols. Configure as needed for your application's requirements.
+# The base image uses 4040 for FileBrowser, 8080 for FastAPI and 22 for SSH
+ports = "4040/http, 8080/http, 22/tcp"
+
+# Disk space allocated for the container. Adjust according to your project's needs.
+container_disk_size_gb = 100
+
+
+[project.env_vars]
+# Environment variables for the pod.
+
+# Duration (in seconds) before terminating the pod after the last SSH session ends.
+POD_INACTIVITY_TIMEOUT = "120"
+
+RUNPOD_DEBUG_LEVEL = "debug"
+UVICORN_LOG_LEVEL = "warning"
+
+# Configurations for caching Hugging Face models and datasets to improve load times and reduce bandwidth.
+HF_HOME = "/runpod-volume/.cache/huggingface/"
+HF_DATASETS_CACHE = "/runpod-volume/.cache/huggingface/datasets/"
+DEFAULT_HF_METRICS_CACHE = "/runpod-volume/.cache/huggingface/metrics/"
+DEFAULT_HF_MODULES_CACHE = "/runpod-volume/.cache/huggingface/modules/"
+HUGGINGFACE_HUB_CACHE = "/runpod-volume/.cache/huggingface/hub/"
+HUGGINGFACE_ASSETS_CACHE = "/runpod-volume/.cache/huggingface/assets/"
+
+# Enable this to use the HF Hub transfer service for faster Hugging Face downloads.
+HF_HUB_ENABLE_HF_TRANSFER = "1" # Requires 'hf_transfer' Python package.
+
+# Directories for caching Python dependencies, speeding up subsequent installations.
+VIRTUALENV_OVERRIDE_APP_DATA = "/runpod-volume/.cache/virtualenv/"
+PIP_CACHE_DIR = "/runpod-volume/.cache/pip/"
+
+
+[runtime]
+# Runtime configuration for the project.
+
+python_version = "%s"
+handler_path = "src/handler.py"
+requirements_path = "builder/requirements.txt"
+`
+
+	// Format the template with dynamic content
+	content := fmt.Sprintf(template, projectName, uuid.New().String()[0:8], cudaVersion, pythonVersion)
+
+	// Write the content to a TOML file
+	tomlPath := filepath.Join(projectFolder, filename)
+	err := os.WriteFile(tomlPath, []byte(content), 0644)
+	if err != nil {
+		fmt.Printf("Failed to write the TOML file: %s\n", err)
+	} else {
+		fmt.Println("TOML file generated successfully with dynamic content.")
+	}
+}
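Reviewer note (illustrative, not part of the patch): a minimal sketch of how the new helpers compose, assuming a hypothetical helper name sketchGenerateAndRead, example values ("my-project", "11.8.0", "3.10"), and go-toml v1 (toml.LoadFile, Tree.GetPath) as the vendored TOML library. It writes runpod.toml via generateProjectToml and reads back the same keys that startProject now type-asserts.

package project

import (
	"fmt"
	"path/filepath"

	"github.com/pelletier/go-toml"
)

// sketchGenerateAndRead is a hypothetical helper (not in the patch) showing the
// round trip: pick a default model, write runpod.toml, then read back the keys
// that startProject checks (name, project.uuid, runtime.python_version).
func sketchGenerateAndRead(projectDir string) error {
	modelName := getDefaultModelName("LLM") // "google/flan-t5-base" per defaults.go
	fmt.Println("default model:", modelName)

	// Write the config the same way createNewProject now does.
	// "my-project", "11.8.0", and "3.10" are example values, not defaults from the patch.
	generateProjectToml(projectDir, "runpod.toml", "my-project", "11.8.0", "3.10")

	// Read it back with go-toml, mirroring loadProjectConfig/startProject.
	config, err := toml.LoadFile(filepath.Join(projectDir, "runpod.toml"))
	if err != nil {
		return fmt.Errorf("failed to load runpod.toml: %w", err)
	}

	projectUuid, ok := config.GetPath([]string{"project", "uuid"}).(string)
	if !ok {
		return fmt.Errorf("project ID not found in config")
	}
	name, _ := config.Get("name").(string)
	pythonVersion, _ := config.GetPath([]string{"runtime", "python_version"}).(string)

	fmt.Printf("name=%s uuid=%s python=%s\n", name, projectUuid, pythonVersion)
	return nil
}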