SageMaker-EKS 하이브리드 ML 아키텍처
📅 작성일: 2026-02-13 | 수정일: 2026-02-14 | ⏱️ 읽는 시간: 약 3분
개요
SageMaker의 관리형 학습 환경과 EKS의 유연한 서빙 인프라를 결합한 하이브리드 ML 아키텍처를 설계합니다. 이 접근 방식은 각 플랫폼의 강점을 활용하여 비용 효율성과 운영 유연성을 동시에 달성합니다.
하이브리드 아키텍처의 장점
🔀 Hybrid Architecture Comparison
SageMaker Training vs EKS Serving vs Hybrid Approach
💰Cost
SageMaker Training
Charged Only During Training
EKS Serving
Continuous Operating Costs
Hybrid Benefit
Managed Training, Optimized Serving
📈Scalability
SageMaker Training
Auto-scaling
EKS Serving
Karpenter Dynamic Provisioning
Hybrid Benefit
Optimal Scaling per Workload
🔧Flexibility
SageMaker Training
Limited Customization
EKS Serving
Full Control
Hybrid Benefit
Standardized Training + Custom Serving
⚙️Operations
SageMaker Training
Fully Managed
EKS Serving
Self-managed
Hybrid Benefit
Reduced Training Burden + Serving Control
🔗Integration
SageMaker Training
AWS Native
EKS Serving
Kubernetes Ecosystem
Hybrid Benefit
Leverage Both Ecosystems
하이브리드 아키텍처 패턴
전체 아키텍처 개요
패턴 1: SageMaker 학습 → EKS 서빙
가장 일반적인 하이브리드 패턴으로, SageMaker에서 모델을 학습하고 EKS에서 서빙합니다.
사용 사례:
- 대규모 분산 학습이 필요한 경우
- 학습 인프라 관리 부담을 줄이고 싶은 경우
- 서빙 환경에서 세밀한 제어가 필요한 경우
패턴 2: EKS 학습 → SageMaker 서빙
특수한 학습 환경이 필요하지만 서빙은 관리형으로 운영하고 싶은 경우입니다.
사용 사례:
- 커스텀 학습 프레임워크 사용
- Kubernetes 네이티브 학습 도구 활용 (Kubeflow, Ray)
- 서빙 인프라 관리 부담을 줄이고 싶은 경우
패턴 3: 하이브리드 서빙
SageMaker Endpoint와 EKS 서빙을 동시에 운영하여 워크로드를 분산합니다.
사용 사례:
- 고가용성이 중요한 프로덕션 환경
- 멀티 리전 배포
- A/B 테스팅 및 카나리 배포
SageMaker Pipelines 통합
SageMaker Components for Kubeflow Pipelines
AWS는 Kubeflow Pipelines에서 SageMaker를 호출할 수 있는 공식 컴포넌트를 제공합니다.
# sagemaker_kubeflow_pipeline.py
import kfp
from kfp import dsl
from kfp.aws import use_aws_secret
import sagemaker
from sagemaker.workflow.pipeline_context import PipelineSession
@dsl.component(
    base_image="public.ecr.aws/sagemaker/sagemaker-distribution:latest",
    packages_to_install=["sagemaker>=2.200.0"]
)
def sagemaker_training_component(
    training_image: str,
    role_arn: str,
    instance_type: str,
    instance_count: int,
    s3_input_data: str,
    s3_output_path: str,
    hyperparameters: dict
) -> str:
    """Run a SageMaker Training Job and return the model artifact S3 URI.

    Args:
        training_image: ECR URI of the training container image.
        role_arn: IAM execution role assumed by the training job.
        instance_type: SageMaker training instance type (e.g. "ml.g5.2xlarge").
        instance_count: Number of instances (distributed training when > 1).
        s3_input_data: S3 prefix holding the training data ("training" channel).
        s3_output_path: S3 prefix where SageMaker writes the model artifact.
        hyperparameters: Hyperparameters forwarded to the training container.

    Returns:
        S3 URI of the trained model artifact (model.tar.gz).
    """
    # Imports live inside the body: KFP serializes this function and executes
    # it in the component container, where only in-function imports resolve.
    # (Removed an unused `import boto3` that the original carried.)
    import sagemaker
    from sagemaker.estimator import Estimator

    session = sagemaker.Session()
    estimator = Estimator(
        image_uri=training_image,
        role=role_arn,
        instance_count=instance_count,
        instance_type=instance_type,
        output_path=s3_output_path,
        sagemaker_session=session,
        hyperparameters=hyperparameters
    )
    # Block until the job finishes so model_data below points at a real artifact.
    estimator.fit({"training": s3_input_data}, wait=True)
    # Hand the artifact path to downstream components.
    return estimator.model_data
@dsl.component(
    base_image="public.ecr.aws/sagemaker/sagemaker-distribution:latest",
    packages_to_install=["sagemaker>=2.200.0"]
)
def register_model_to_registry(
    model_data: str,
    model_package_group_name: str,
    inference_image: str,
    role_arn: str
) -> str:
    """Register a trained model in the SageMaker Model Registry.

    The package is created with approval status "PendingManualApproval" so a
    human (or an approval policy) gates promotion to serving.

    Args:
        model_data: S3 URI of the model artifact produced by training.
        model_package_group_name: Target Model Package Group.
        inference_image: ECR URI of the inference container image.
        role_arn: IAM role used for the registration call.

    Returns:
        ARN of the newly created model package.
    """
    # In-function imports: required because KFP runs this function inside the
    # component container. (Removed an unused `import boto3`.)
    import sagemaker
    from sagemaker.model import Model

    session = sagemaker.Session()
    model = Model(
        image_uri=inference_image,
        model_data=model_data,
        role=role_arn,
        sagemaker_session=session
    )
    # Register in the Model Registry; instance lists constrain where this
    # package may later be deployed/transformed.
    model_package = model.register(
        content_types=["application/json"],
        response_types=["application/json"],
        inference_instances=["ml.g5.xlarge"],
        transform_instances=["ml.g5.xlarge"],
        model_package_group_name=model_package_group_name,
        approval_status="PendingManualApproval"
    )
    return model_package.model_package_arn
@dsl.component(
    base_image="python:3.10",
    packages_to_install=["kubernetes", "boto3"]
)
def deploy_to_kserve(
    model_package_arn: str,
    model_name: str,
    namespace: str = "kserve-inference"
) -> str:
    """Deploy a registered SageMaker model as a KServe InferenceService.

    Resolves the model artifact URL from the SageMaker Model Registry, then
    creates an InferenceService in the given namespace (must run in-cluster).

    Args:
        model_package_arn: ARN of the registered model package.
        model_name: Name for the InferenceService resource.
        namespace: Target Kubernetes namespace.

    Returns:
        A human-readable confirmation string.
    """
    import boto3
    from kubernetes import client, config

    # Look up the artifact location from the SageMaker Model Registry.
    sm_client = boto3.client('sagemaker')
    model_package = sm_client.describe_model_package(
        ModelPackageName=model_package_arn
    )
    model_data_url = model_package['InferenceSpecification']['Containers'][0]['ModelDataUrl']

    # Create the KServe InferenceService (requires in-cluster credentials).
    config.load_incluster_config()
    custom_api = client.CustomObjectsApi()
    inference_service = {
        "apiVersion": "serving.kserve.io/v1beta1",
        "kind": "InferenceService",
        "metadata": {
            "name": model_name,
            "namespace": namespace
        },
        "spec": {
            "predictor": {
                # FIX: minReplicas/maxReplicas belong to the predictor spec
                # (ComponentExtensionSpec) in KServe v1beta1, not to the
                # top-level spec where the original placed them.
                "minReplicas": 2,
                "maxReplicas": 10,
                "pytorch": {
                    "storageUri": model_data_url,
                    "resources": {
                        "requests": {
                            "nvidia.com/gpu": "1",
                            "memory": "8Gi"
                        },
                        "limits": {
                            "nvidia.com/gpu": "1",
                            "memory": "16Gi"
                        }
                    }
                }
            }
        }
    }
    custom_api.create_namespaced_custom_object(
        group="serving.kserve.io",
        version="v1beta1",
        namespace=namespace,
        plural="inferenceservices",
        body=inference_service
    )
    return f"Deployed {model_name} to KServe"
@dsl.pipeline(
    name="SageMaker to EKS Hybrid Pipeline",
    description="Train on SageMaker, deploy to EKS"
)
def hybrid_ml_pipeline(
    training_image: str = "763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.1.0-gpu-py310",
    inference_image: str = "763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference:2.1.0-gpu-py310",
    role_arn: str = "arn:aws:iam::123456789012:role/SageMakerExecutionRole",
    instance_type: str = "ml.g5.2xlarge",
    s3_input_data: str = "s3://my-bucket/training-data/",
    s3_output_path: str = "s3://my-bucket/models/",
    model_package_group: str = "fraud-detection-models"
) -> str:
    """Train on SageMaker, register the model, and deploy it to EKS/KServe.

    FIX: the original called ``task.apply(use_aws_secret(...))``, which is the
    KFP v1 SDK API; tasks created from KFP v2 ``@dsl.component`` functions have
    no ``.apply`` method, so the pipeline failed to compile. Grant the pipeline
    pods AWS access via IRSA (IAM Roles for Service Accounts) on EKS, or use
    the kfp-kubernetes extension's ``use_secret_as_env`` on each task instead.
    A ``-> str`` return annotation is also required in KFP v2 when a pipeline
    returns a task output.
    """
    # 1. Train on SageMaker.
    training_task = sagemaker_training_component(
        training_image=training_image,
        role_arn=role_arn,
        instance_type=instance_type,
        instance_count=2,
        s3_input_data=s3_input_data,
        s3_output_path=s3_output_path,
        hyperparameters={
            "epochs": "50",
            "batch-size": "64",
            "learning-rate": "0.001"
        }
    )
    # 2. Register the artifact in the Model Registry.
    registry_task = register_model_to_registry(
        model_data=training_task.output,
        model_package_group_name=model_package_group,
        inference_image=inference_image,
        role_arn=role_arn
    )
    # 3. Deploy the registered model to KServe on EKS.
    deploy_task = deploy_to_kserve(
        model_package_arn=registry_task.output,
        model_name="fraud-detection-v1",
        namespace="kserve-inference"
    )
    return deploy_task.output
SageMaker Model Registry 거버넌스
중앙 집중식 모델 관리
SageMaker Model Registry는 모든 모델의 중앙 저장소 역할을 하며, EKS 서빙 환경에서도 동일한 거버넌스를 적용할 수 있습니다.
Model Registry 설정
# model_registry_setup.py
import boto3
import sagemaker
from sagemaker.model_package import ModelPackageGroup

sm_client = boto3.client('sagemaker')
session = sagemaker.Session()

# Create the Model Package Group (idempotent: an existing group is tolerated).
model_package_group_name = "fraud-detection-models"
group_tags = [
    {"Key": "Team", "Value": "ml-platform"},
    {"Key": "Environment", "Value": "production"},
]
try:
    sm_client.create_model_package_group(
        ModelPackageGroupName=model_package_group_name,
        ModelPackageGroupDescription="Fraud detection models for production",
        Tags=group_tags,
    )
except sm_client.exceptions.ResourceInUse:
    print(f"Model package group {model_package_group_name} already exists")

# Approval-policy rules: auto-approve high-accuracy models, reject weak ones.
model_approval_policy = {
    "Rules": [
        {
            "Name": "AutoApproveHighAccuracy",
            "Condition": {
                "MetricName": "accuracy",
                "Operator": "GreaterThanOrEqualTo",
                "Value": 0.95,
            },
            "Action": "Approve",
        },
        {
            "Name": "RejectLowAccuracy",
            "Condition": {
                "MetricName": "accuracy",
                "Operator": "LessThan",
                "Value": 0.85,
            },
            "Action": "Reject",
        },
    ]
}
EKS에서 Model Registry 조회
# eks_model_loader.py
import boto3
from kubernetes import client, config
def get_approved_model_from_registry(model_package_group_name: str) -> str:
    """Return the model-data S3 URL of the newest approved model package.

    Lists packages in ``model_package_group_name`` whose approval status is
    "Approved" (newest first) and resolves the first container's ModelDataUrl.

    Raises:
        ValueError: If the group contains no approved model packages.
    """
    sm_client = boto3.client('sagemaker')

    # Only the single newest approved package is needed.
    listing = sm_client.list_model_packages(
        ModelPackageGroupName=model_package_group_name,
        ModelApprovalStatus='Approved',
        SortBy='CreationTime',
        SortOrder='Descending',
        MaxResults=1
    )
    summaries = listing['ModelPackageSummaryList']
    if not summaries:
        raise ValueError(f"No approved models found in {model_package_group_name}")

    # Resolve the artifact location from the package details.
    details = sm_client.describe_model_package(
        ModelPackageName=summaries[0]['ModelPackageArn']
    )
    return details['InferenceSpecification']['Containers'][0]['ModelDataUrl']
def update_kserve_with_latest_model(
    model_name: str,
    namespace: str,
    model_package_group_name: str = "fraud-detection-models",
) -> None:
    """Patch a KServe InferenceService to serve the latest approved model.

    Args:
        model_name: Name of the InferenceService to patch.
        namespace: Kubernetes namespace of the InferenceService.
        model_package_group_name: SageMaker Model Registry group to read the
            newest approved model from. Defaults to the fraud-detection group
            that was previously hard-coded, so existing callers are unchanged.
    """
    # Must run in-cluster (pod service-account credentials).
    config.load_incluster_config()
    custom_api = client.CustomObjectsApi()

    # Resolve the newest approved model artifact from the registry.
    model_url = get_approved_model_from_registry(model_package_group_name)

    # Merge-patch only the storageUri; KServe rolls the predictor to pick up
    # the new artifact.
    patch_body = {
        "spec": {
            "predictor": {
                "pytorch": {
                    "storageUri": model_url
                }
            }
        }
    }
    custom_api.patch_namespaced_custom_object(
        group="serving.kserve.io",
        version="v1beta1",
        namespace=namespace,
        plural="inferenceservices",
        name=model_name,
        body=patch_body
    )
    print(f"Updated {model_name} with model from {model_url}")
비용 최적화 전략
학습 vs 서빙 비용 분석
💰 Cost Optimization Strategy Comparison
Training vs Serving Cost Analysis and Optimization
💵Instance Cost
SageMaker Training
Charged Only During Training
EKS Serving
24/7 Operating Costs
Optimization Strategy
Utilize Spot Instances
💾Storage
SageMaker Training
S3 (Low Cost)
EKS Serving
EBS + S3
Optimization Strategy
S3-Centric Architecture
🌐Network
SageMaker Training
Free Within VPC
EKS Serving
Data Transfer Costs
Optimization Strategy
Use VPC Endpoints
👥Management Overhead
SageMaker Training
None
EKS Serving
Operations Staff Required
Optimization Strategy
Offset with Automation
비용 최적화 체크리스트
# cost-optimization-config.yaml
# Cost-optimization settings for the hybrid architecture: SageMaker training,
# EKS/Karpenter serving, and S3 storage lifecycle.
training:
  # SageMaker Managed Spot Training (up to 90% savings)
  use_spot_instances: true
  max_wait_time_seconds: 86400 # 24 hours
  max_run_time_seconds: 43200 # 12 hours
  # Enable checkpointing (to survive Spot interruptions)
  checkpoint_s3_uri: s3://my-bucket/checkpoints/
  checkpoint_local_path: /opt/ml/checkpoints
  # Instance-type optimization
  instance_type: ml.g5.2xlarge # GPU training
  instance_count: 2
  # Auto-terminate after training completes
  auto_terminate: true
serving:
  # Karpenter Spot instances (up to 70% savings)
  capacity_type: spot
  # Autoscaling settings
  min_replicas: 1
  max_replicas: 10
  target_utilization: 70
  # Scale down after idle time
  scale_down_delay: 300 # 5 minutes
  # GPU sharing (MIG or MPS)
  enable_gpu_sharing: true
  max_shared_clients: 4
storage:
  # S3 Intelligent-Tiering
  s3_storage_class: INTELLIGENT_TIERING
  # Archive old models
  lifecycle_policy:
    archive_after_days: 90
    delete_after_days: 365
비용 모니터링 대시보드
# cost_monitoring.py
import boto3
from datetime import datetime, timedelta
def get_sagemaker_training_costs(days=30):
    """Return daily SageMaker costs for the past ``days`` days.

    Args:
        days: Look-back window in days (default 30).

    Returns:
        The raw Cost Explorer GetCostAndUsage response, grouped by USAGE_TYPE.
    """
    # FIX: Cost Explorer interprets TimePeriod dates as UTC. The original used
    # the machine's local date, which can shift the window by a day near
    # midnight — anchor on UTC explicitly.
    from datetime import timezone

    ce_client = boto3.client('ce')
    end_date = datetime.now(timezone.utc).date()
    start_date = end_date - timedelta(days=days)

    response = ce_client.get_cost_and_usage(
        TimePeriod={
            # 'End' is exclusive per the Cost Explorer API.
            'Start': start_date.isoformat(),
            'End': end_date.isoformat()
        },
        Granularity='DAILY',
        Metrics=['UnblendedCost'],
        Filter={
            'Dimensions': {
                'Key': 'SERVICE',
                'Values': ['Amazon SageMaker']
            }
        },
        GroupBy=[
            {'Type': 'DIMENSION', 'Key': 'USAGE_TYPE'}
        ]
    )
    return response
def get_eks_serving_costs(cluster_name: str, days=30):
    """Return daily EC2 compute costs attributed to an EKS cluster.

    Filters Cost Explorer to EC2 compute tagged with the standard
    ``kubernetes.io/cluster/<name> = owned`` ownership tag.

    Args:
        cluster_name: EKS cluster name used in the ownership tag key.
        days: Look-back window in days (default 30).

    Returns:
        The raw Cost Explorer GetCostAndUsage response.
    """
    # FIX: same local-vs-UTC date defect as get_sagemaker_training_costs —
    # Cost Explorer dates are UTC, so compute the window in UTC.
    from datetime import timezone

    ce_client = boto3.client('ce')
    end_date = datetime.now(timezone.utc).date()
    start_date = end_date - timedelta(days=days)

    response = ce_client.get_cost_and_usage(
        TimePeriod={
            # 'End' is exclusive per the Cost Explorer API.
            'Start': start_date.isoformat(),
            'End': end_date.isoformat()
        },
        Granularity='DAILY',
        Metrics=['UnblendedCost'],
        Filter={
            'And': [
                {
                    'Dimensions': {
                        'Key': 'SERVICE',
                        'Values': ['Amazon Elastic Compute Cloud - Compute']
                    }
                },
                {
                    'Tags': {
                        'Key': f'kubernetes.io/cluster/{cluster_name}',
                        'Values': ['owned']
                    }
                }
            ]
        }
    )
    return response