Feat: 메모리 확인

2024-10-04 13:02:33 +09:00 · 2024-10-04 13:02:33 +09:00 · 993b444c6e
commit 993b444c6e
parent 28c709e415
1 changed files with 38 additions and 6 deletions
--- a/ai/app/api/yolo/detection.py
+++ b/ai/app/api/yolo/detection.py
@ -1,4 +1,5 @@
 import os
+import time

 import psutil
 from fastapi import APIRouter, HTTPException
@ -111,10 +112,14 @@ async def detection_train(request: TrainRequest):
    send_slack_message(f"train 요청{request}", status="success")

    # 데이터셋 루트 경로 얻기 (프로젝트 id 기반)
+    
    dataset_root_path = get_dataset_root_path(request.project_id)
-
+    
    # 모델 로드
+    start_time = time.time()
+    print("모델 로드")
    model = get_model(request.project_id, request.m_key)
+    print(f'걸린 시간 {time.time() - start_time:.2f} 초')

    # 이 값을 학습할때 넣으면 이 카테고리들이 학습됨
    names = list(request.label_map)
@ -125,23 +130,49 @@ async def detection_train(request: TrainRequest):
    # value : 모델에 저장될 카테고리 id (모델에는 key의 idx 순서대로 저장될 것임)
    
    # 데이터 전처리: 학습할 디렉토리 & 데이터셋 설정 파일을 생성
+    start_time = time.time()
+    print("데이터 전처리 : 학습할 디렉토리 및 데이터셋 설정 파일 생성")
    process_directories(dataset_root_path, names)
+    print(f'걸린 시간 {time.time() - start_time:.2f} 초')

    # 데이터 전처리: 데이터를 학습데이터와 검증데이터로 분류
+    start_time = time.time()
+    print("데이터 전처리 : 데이터 분류")
    train_data, val_data = split_data(request.data, request.ratio)
-    
+    print(f'걸린 시간 {time.time() - start_time:.2f} 초')

    # 데이터 전처리: 데이터 이미지 및 레이블 다운로드
+    start_time = time.time()
+    print("데이터 전처리 : 데이터 다운로드")
    download_data(train_data, val_data, dataset_root_path, label_converter)
-
+    print(f'걸린 시간 {time.time() - start_time:.2f} 초')
+    
    # 학습
+    start_time = time.time()
+    print("학습 시작")
    results = run_train(request, model,dataset_root_path)
+    print(f'걸린 시간 {time.time() - start_time:.2f} 초')
+
+    # 학습 후 GPU 메모리 상태 확인
+    if torch.cuda.is_available():
+        allocated_memory = torch.cuda.memory_allocated() / (1024 ** 2)  # MB 단위
+        reserved_memory = torch.cuda.memory_reserved() / (1024 ** 2)  # MB 단위
+        print(f"현재 할당된 GPU 메모리: {allocated_memory:.2f} MB")
+        print(f"현재 예약된 GPU 메모리: {reserved_memory:.2f} MB")
+    else:
+        print("GPU 사용 불가능")
+    torch.cuda.empty_cache()

    # best 모델 저장
+    start_time = time.time()
+    print("모델 저장")
    model_key = save_model(project_id=request.project_id, path=join_path(dataset_root_path, "result", "weights", "best.pt"))
-    
-    result = results.results_dict
+    print(f'걸린 시간 {time.time() - start_time:.2f} 초')

+    print("변환")
+    result = results.results_dict
+    print(f'걸린 시간 {time.time() - start_time:.2f} 초')
+    
    response = TrainResponse(
        modelKey=model_key,
        precision= result["metrics/precision(B)"],
@ -152,7 +183,8 @@ async def detection_train(request: TrainRequest):
        fitness= result["fitness"]
    )
    send_slack_message(f"train 성공{response}", status="success")
-        
+
+    print(response)
    return response

 def split_data(data:list[TrainDataInfo], ratio:float):