This tutorial combines local WSI preprocessing using lazyslide with scalable cloud inference using Amazon SageMaker and the H-optimus-1 foundation model. What follows is a transcript of a Python notebook with sample outputs; a download of the complete notebook is provided at the end.
In this workflow we will:
- Use lazyslide to segment tissues and generate image tiles (patches) from Whole Slide Images (WSIs) located in a local ./data directory.
- Upload the patches to Amazon S3.
- Run a SageMaker Batch Transform job per slide with the H-optimus-1 model to generate tile-level embeddings.
- Consolidate the outputs into NumPy arrays (.npy) containing the embeddings for each slide and store them on S3.
Ensure you have the following installed:
- lazyslide
- sagemaker
- boto3
- numpy
- tqdm
- Pillow
Note: You must have an AWS account and appropriate IAM permissions to use SageMaker and S3. You also need to subscribe to H-optimus-1 on the AWS Marketplace.
# Tested using python 3.13.7
%pip install sagemaker=="2.237.1"
%pip install pillow=="11.1.0"
%pip install lazyslide boto3 numpy tqdm
import lazyslide as zs
import sagemaker
import boto3
import numpy as np
import pandas as pd
import os
import glob
import json
import time
from PIL import Image
from tqdm.notebook import tqdm
from io import BytesIO
from sagemaker import ModelPackage, get_execution_role
from datetime import datetime
# Configuration
DATA_DIR = "data/"
PATCHES_DIR = os.path.join(DATA_DIR, "patches")
MODEL_NAME = "h-optimus-1"
REGION = boto3.Session().region_name
# Ensure local directories exist
os.makedirs(PATCHES_DIR, exist_ok=True)
# AWS Setup
session = sagemaker.Session()
role = get_execution_role()
bucket = session.default_bucket()
s3_client = boto3.client('s3')
sm_client = boto3.client('sagemaker')
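Before moving on, it can be useful to print the resolved configuration so you can confirm the role, bucket, and region the rest of the notebook will use. This is a minimal, optional sanity check using only the variables defined above.
# Optional sanity check: confirm which role, bucket, and region will be used below
print(f"Region: {REGION}")
print(f"Execution role: {role}")
print(f"Default bucket: {bucket}")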
We will iterate through slides in the ./data directory. For each slide, we use lazyslide to find tissue, determine tile coordinates, and then save those tiles as PNG images to disk.
# Find all slide files (assuming .svs format)
slide_paths = glob.glob(os.path.join(DATA_DIR, "*.svs"))
print(f"Found {len(slide_paths)} slides to process.")
processed_slides = []
for slide_path in tqdm(slide_paths, desc="Processing Slides"):
    slide_filename = os.path.basename(slide_path)
    slide_name, _ = os.path.splitext(slide_filename)
    # Create a directory for this slide's patches
    slide_patch_dir = os.path.join(PATCHES_DIR, slide_name)
    os.makedirs(slide_patch_dir, exist_ok=True)
    print(f"\nProcessing {slide_filename}...")
    # 1. Open WSI
    wsi = zs.open_wsi(slide_path)
    # 2. Find tissue and generate tile coordinates
    # We use 224x224 as required by H-optimus-1, typically at 0.5 microns per pixel (20x)
    zs.pp.find_tissues(wsi)
    zs.pp.tile_tissues(wsi, tile_px=224, mpp=0.5)
    wsi.write()
    # 3. Export tiles to disk
    wsi = zs.open_wsi(slide_path)
    try:
        # Retrieve coordinates (x, y) of the top-left corner of each tile and store them in a list
        coords_data = []
        # Get top-left x and y coords
        for polygon in wsi["tiles"]["geometry"]:
            coords = list(polygon.exterior.coords)
            xf, yf = coords[0]
            x = int(xf)
            y = int(yf)
            row_dict = {'x': x, 'y': y}
            coords_data.append(row_dict)
        total_tiles = len(coords_data)
        coords_df = pd.DataFrame(coords_data)
        coords = zip(coords_df['x'], coords_df['y'])
        print(f"Exporting {total_tiles} patches...")
        # Read each region via the wsi object and save it as a PNG
        for i, (x, y) in enumerate(tqdm(coords, total=total_tiles, leave=False)):
            # Format serial number: slide_name_000001
            serial_number = f"{i:06d}"
            patch_filename = f"{slide_name}_{serial_number}.png"
            save_path = os.path.join(slide_patch_dir, patch_filename)
            # Read region (x, y are level 0 coordinates)
            tile_image = wsi.read_region(x, y, 224, 224)
            # If tile_image is a numpy array, convert it to PIL
            if isinstance(tile_image, np.ndarray):
                tile_image = Image.fromarray(tile_image)
            tile_image.save(save_path)
        processed_slides.append(slide_name)
    except Exception as e:
        print(f"Error processing {slide_name}: {e}")
Found 2 slides to process.
Processing GTEX-1117F-1026.svs...
Exporting 3254 patches...
Processing GTEX-111FC-0426.svs...
Exporting 655 patches...
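Before uploading, it is worth spot-checking one exported patch to confirm it is a 224x224 image. The sketch below is optional and assumes at least one slide was processed successfully; it reuses the variables defined above.
# Optional spot check: open one exported patch and verify its size and mode
sample_patches = sorted(glob.glob(os.path.join(PATCHES_DIR, processed_slides[0], "*.png")))
sample_img = Image.open(sample_patches[0])
print(f"{os.path.basename(sample_patches[0])}: size={sample_img.size}, mode={sample_img.mode}")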
We upload the organized patch folders to S3. The structure on S3 will be s3://<bucket>/<model_name>/patches/<slide_name>/<images>.
NOTE: This step can take a long time if you have multiple slides with many thousands of patches; we recommend testing on a small set of small WSIs.
s3_input_prefix = f"{MODEL_NAME}/patches"
print(f"Uploading patches to s3://{bucket}/{s3_input_prefix}...")
# Upload using sagemaker session helper
# We upload the whole 'patches' directory to the input prefix
transform_input_uri = session.upload_data(
    path=PATCHES_DIR,
    bucket=bucket,
    key_prefix=s3_input_prefix
)
print(f"Upload complete. Data available at: {transform_input_uri}")
Uploading patches to s3://sagemaker-eu-north-1-840737971346/h-optimus-1/patches...
Upload complete.
Data available at: s3://sagemaker-eu-north-1-840737971346/h-optimus-1/patches
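As an optional check before launching any jobs, you can compare the number of patches on disk with the number of objects that landed under the S3 prefix. This sketch reuses the bucket, prefix, and S3 client defined above.
# Optional check: count local patches vs. objects uploaded to S3
local_count = len(glob.glob(os.path.join(PATCHES_DIR, "*", "*.png")))
s3_count = 0
paginator = s3_client.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=bucket, Prefix=s3_input_prefix):
    s3_count += len(page.get('Contents', []))
print(f"Local patches: {local_count}, objects on S3: {s3_count}")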
Define the model package ARN (ensure this matches your region) and create the SageMaker model object.
# H-optimus-1 ARN for eu-north-1 (Example).
# PLEASE REPLACE with the ARN for your specific region from the AWS Marketplace subscription page.
model_package_arn = "arn:aws:sagemaker:eu-north-1:136758871317:model-package/h-optimus-1-7f16e68f69cf3b7bb608d126ac6b9a99"
# Create the model object
print(f"Creating Model: {MODEL_NAME}...")
create_model_response = sm_client.create_model(
    ModelName=MODEL_NAME,
    ExecutionRoleArn=role,
    PrimaryContainer={
        # This tells SageMaker to use the Model Package definition
        "ModelPackageName": model_package_arn
    },
    EnableNetworkIsolation=True
)
print("Model object created.")
Creating Model: h-optimus-1...
Model object created.
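You can optionally confirm that the model object was registered and see which container it resolved to; this is a minimal check using the SageMaker client created earlier, and the container name is also the detail referenced later when unsubscribing.
# Optional: confirm the model exists and inspect its primary container
model_desc = sm_client.describe_model(ModelName=MODEL_NAME)
print(f"Model ARN: {model_desc['ModelArn']}")
print(f"Primary container: {model_desc['PrimaryContainer']}")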
We will iterate through each processed slide and launch a Batch Transform job.
Each job reads patches from s3://<bucket>/h-optimus-1/patches/<slide_name>/ and writes its output to s3://<bucket>/h-optimus-1_output/<slide_name>/. NOTE: This step can take a long time if you have a lot of slides; we recommend testing with a small set of small WSIs.
batch_instance_type = "ml.g5.xlarge"
output_base_prefix = f"{MODEL_NAME}_output"
# Use the boto3 create_transform_job method to create one transform job per slide
for slide_name in tqdm(processed_slides, desc="Running Batch Jobs"):
    print(f"\nStarting Batch Transform for slide: {slide_name}")
    # Define input and output specific to this slide
    s3_slide_input = f"s3://{bucket}/{s3_input_prefix}/{slide_name}"
    s3_slide_output = f"s3://{bucket}/{output_base_prefix}/{slide_name}"
    transform_job_name = f"transform-job-{slide_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}"
    print(f"Starting Transform Job: {transform_job_name}...")
    response = sm_client.create_transform_job(
        TransformJobName=transform_job_name,
        ModelName=MODEL_NAME,  # Reference the model created in the step above
        MaxConcurrentTransforms=1,
        MaxPayloadInMB=6,
        BatchStrategy="MultiRecord",
        TransformInput={
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",  # Process all files under the prefix
                    "S3Uri": s3_slide_input
                }
            },
            "ContentType": "application/x-image",
            "SplitType": "None",
            "CompressionType": "None"
        },
        TransformOutput={
            "S3OutputPath": s3_slide_output,
            "AssembleWith": "Line",
            "Accept": "application/json"
        },
        TransformResources={
            "TransformAmiVersion": "al2-ami-sagemaker-batch-gpu-535",
            "InstanceType": batch_instance_type,
            "InstanceCount": 1
        }
    )
    print(f"Transform Job ARN: {response['TransformJobArn']}")
    print("Waiting for job to complete...")
    start_time = time.time()
    waiter = sm_client.get_waiter('transform_job_completed_or_stopped')
    waiter.wait(TransformJobName=transform_job_name)
    end_time = time.time()
    # Calculate duration
    duration_seconds = end_time - start_time
    minutes = int(duration_seconds // 60)
    seconds = int(duration_seconds % 60)
    # Check final status
    status = sm_client.describe_transform_job(TransformJobName=transform_job_name)
    print(f" Job finished with status: {status['TransformJobStatus']}")
    print(f" Total Wait Time: {minutes}m {seconds}s")
    print(f"Job finished for {slide_name}. Output stored at {s3_slide_output}")
Starting Batch Transform for slide: GTEX-1117F-1026
Starting Transform Job: transform-job-GTEX-1117F-1026-2025-12-07-14-14-20...
Transform Job ARN: arn:aws:sagemaker:eu-north-1:840737971346:transform-job/transform-job-GTEX-1117F-1026-2025-12-07-14-14-20
Waiting for job to complete...
Job finished with status: Completed
Total Wait Time: 20m 5s
Job finished for GTEX-1117F-1026. Output stored at s3://sagemaker-eu-north-1-840737971346/h-optimus-1_output/GTEX-1117F-1026
Starting Batch Transform for slide: GTEX-111FC-0426
Starting Transform Job: transform-job-GTEX-111FC-0426-2025-12-07-14-34-26...
Transform Job ARN: arn:aws:sagemaker:eu-north-1:840737971346:transform-job/transform-job-GTEX-111FC-0426-2025-12-07-14-34-26
Waiting for job to complete...
Job finished with status: Completed
Total Wait Time: 12m 3s
Job finished for GTEX-111FC-0426. Output stored at s3://sagemaker-eu-north-1-840737971346/h-optimus-1_output/GTEX-111FC-0426
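Before consolidating everything, it can help to peek at a single .out file and confirm it has the structure the consolidation step below expects (a JSON list with one embedding per input image). The sketch below is a hedged example that reuses the bucket and output prefix from above and simply picks the first .out object it finds for the first processed slide.
# Optional: peek at one Batch Transform output file to confirm its structure
peek_prefix = f"{output_base_prefix}/{processed_slides[0]}/"
resp = s3_client.list_objects_v2(Bucket=bucket, Prefix=peek_prefix, MaxKeys=5)
first_out_key = next(obj['Key'] for obj in resp.get('Contents', []) if obj['Key'].endswith('.out'))
body = s3_client.get_object(Bucket=bucket, Key=first_out_key)['Body'].read().decode('utf-8')
parsed = json.loads(body)
print(f"{first_out_key}: {len(parsed)} embedding(s), first has {len(parsed[0])} dimensions")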
Now that the embeddings are generated as JSON files in S3 (files ending in .out), we will, for each slide:
- Download and parse the per-tile output files.
- Stack the embeddings into a single NumPy array of shape (N_tiles, 1536).
- Save the array as a .npy file locally and upload it to s3://<bucket>/h-optimus-1_output/slide_embeddings/.
embedding_s3_prefix = f"{output_base_prefix}/slide_embeddings"
for slide_name in tqdm(processed_slides, desc="Consolidating Embeddings"):
print(f"Processing outputs for {slide_name}...")
# List all objects in the slide's output directory
slide_output_prefix = f"{output_base_prefix}/{slide_name}/"
# Use paginator to handle cases with >1000 tiles
paginator = s3_client.get_paginator('list_objects_v2')
pages = paginator.paginate(Bucket=bucket, Prefix=slide_output_prefix)
feature_list = []
# Collect all keys first to sort them by serial number
all_keys = []
for page in pages:
if 'Contents' in page:
for obj in page['Contents']:
key = obj['Key']
if key.endswith('.out'):
all_keys.append(key)
if not all_keys:
print(f"No output files found for {slide_name}.")
continue
# Sort keys to ensure the numpy array order matches the input image sequence
# Filename format: <slide>_000001.png.out
all_keys.sort()
# Check keys
print(f"First output = {all_keys[0]}")
print(f"Second output = {all_keys[1]}")
print(f"Length of output keys = {len(all_keys)}")
# Download and parse JSONs
for key in tqdm(all_keys, leave=False, desc="Downloading/Parsing"):
response = s3_client.get_object(Bucket=bucket, Key=key)
file_content = response['Body'].read().decode('utf-8')
# The output format from Batch Transform is usually the JSON response
try:
json_content = json.loads(file_content)
# H-optimus-1 usually returns a list of embeddings, we expect 1 per image
# Shape: [1, 1536]
embedding = json_content[0]
feature_list.append(embedding)
except Exception as e:
print(f"Failed to parse {key}: {e}")
# Convert to Numpy Array
embeddings_array = np.array(feature_list)
if embeddings_array.shape[1] != 1536:
print(f"Warning: Unexpected embedding dimension for {slide_name}: {embeddings_array.shape}")
# Save locally
local_npy_path = os.path.join(DATA_DIR, f"{slide_name}.npy")
np.save(local_npy_path, embeddings_array)
# Upload to S3
s3_output_key = f"{embedding_s3_prefix}/{slide_name}.npy"
s3_client.upload_file(local_npy_path, bucket, s3_output_key)
print(f"Successfully saved {slide_name}.npy shape={embeddings_array.shape}")
print("\\n--- Processing Complete ---")
print(f"All slide embeddings are stored in S3 at: s3://{bucket}/{embedding_s3_prefix}/")
Processing outputs for GTEX-1117F-1026...
First output = h-optimus-1_output/GTEX-1117F-1026/GTEX-1117F-1026_000000.png.out
Second output = h-optimus-1_output/GTEX-1117F-1026/GTEX-1117F-1026_000001.png.out
Length of output keys = 3254
Successfully saved GTEX-1117F-1026.npy shape=(3254, 1536)
Processing outputs for GTEX-111FC-0426...
First output = h-optimus-1_output/GTEX-111FC-0426/GTEX-111FC-0426_000000.png.out
Second output = h-optimus-1_output/GTEX-111FC-0426/GTEX-111FC-0426_000001.png.out
Length of output keys = 655
Successfully saved GTEX-111FC-0426.npy shape=(655, 1536)
--- Processing Complete ---
All slide embeddings are stored in S3 at: s3://sagemaker-eu-north-1-840737971346/h-optimus-1_output/slide_embeddings/
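As a quick illustration of how these per-tile embeddings can be used downstream, the sketch below loads one of the saved .npy files and mean-pools the tile embeddings into a single 1536-dimensional slide-level vector. Mean pooling is just one simple aggregation strategy; the variable names reuse those defined above.
# Example downstream use: load a slide's tile embeddings and mean-pool them
# into a single slide-level feature vector
example_npy = os.path.join(DATA_DIR, f"{processed_slides[0]}.npy")
tile_embeddings = np.load(example_npy)           # shape: (N_tiles, 1536)
slide_embedding = tile_embeddings.mean(axis=0)   # shape: (1536,)
print(f"{processed_slides[0]}: tiles={tile_embeddings.shape[0]}, slide embedding dim={slide_embedding.shape[0]}")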
After finishing all steps, we should delete the model.
# Clean up model
try:
    sm_client.delete_model(ModelName=MODEL_NAME)
    print(f" Successfully deleted model: {MODEL_NAME}")
except Exception as cleanup_error:
    print(" Warning: Could not delete model. It may have already been deleted or never created.")
    print(f" Error details: {cleanup_error}")
Successfully deleted model: h-optimus-1
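If you also want to remove the uploaded patches and the raw per-slide Batch Transform outputs from S3 to avoid ongoing storage charges, you can delete the corresponding prefixes. The sketch below is optional and irreversible; it assumes you no longer need the intermediate data and deliberately leaves the consolidated .npy files under the slide_embeddings prefix untouched.
# Optional cleanup: delete uploaded patches and raw per-slide outputs from S3
# (keeps the consolidated .npy files under the slide_embeddings prefix)
paginator = s3_client.get_paginator('list_objects_v2')
prefixes_to_delete = [f"{s3_input_prefix}/"] + [f"{output_base_prefix}/{name}/" for name in processed_slides]
for prefix in prefixes_to_delete:
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get('Contents', []):
            s3_client.delete_object(Bucket=bucket, Key=obj['Key'])
    print(f"Deleted objects under s3://{bucket}/{prefix}")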
If you would like to unsubscribe from the model package, follow these steps. Before you cancel the subscription, ensure that you do not have any deployable model created from the model package or using the algorithm. Note: you can find this information by looking at the container name associated with the model.
Steps to unsubscribe from the product on AWS Marketplace:
1. Navigate to the Machine Learning tab on your Software subscriptions page.
2. Locate the listing that you want to cancel the subscription for, and then choose Cancel Subscription.
Complete Python Notebook for this tutorial:
Generating_embeddings_for_multiple_slides_on_AWS.md
Latest version: December 16, 2025
Support: [email protected]