From 28c709e4158d7ddc8a5242d53180bc86a5dd78d3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=EA=B9=80=EC=9A=A9=EC=88=98?= <akys159357@naver.com>
Date: Fri, 4 Oct 2024 11:36:21 +0900
Subject: [PATCH 1/2] =?UTF-8?q?Feat:=20GPU=20=EB=A9=94=EB=AA=A8=EB=A6=AC?=
 =?UTF-8?q?=20=EC=83=81=ED=83=9C=20=EC=9D=91=EB=8B=B5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ai/app/api/yolo/detection.py | 25 +++++++++++++++++++++++++
 ai/app/utils/file_utils.py   |  1 -
 ai/environment.yml           |  1 +
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/ai/app/api/yolo/detection.py b/ai/app/api/yolo/detection.py
index ab496a9..eef3c31 100644
--- a/ai/app/api/yolo/detection.py
+++ b/ai/app/api/yolo/detection.py
@@ -1,3 +1,6 @@
+import os
+
+import psutil
 from fastapi import APIRouter, HTTPException
 from schemas.predict_request import PredictRequest
 from schemas.train_request import TrainRequest, TrainDataInfo
@@ -234,3 +237,25 @@ def run_train(request, model, dataset_root_path):
         raise e
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"exception in run_train(): {e}")
+
+@router.get("/memory")
+async def get_memory_status():
+    # Report GPU memory figures for the active CUDA device (via torch.cuda); responds 404 when CUDA is unavailable
+    if torch.cuda.is_available():
+        # Index of the currently selected CUDA device
+        current_device = torch.cuda.current_device()
+
+        total_gpu_memory = torch.cuda.get_device_properties(current_device).total_memory
+        allocated_gpu_memory = torch.cuda.memory_allocated(current_device)
+        reserved_gpu_memory = torch.cuda.memory_reserved(current_device)
+
+        gpu_memory = {
+            "current_device" : current_device,
+            "total": total_gpu_memory / (1024 ** 3),  # total device memory, bytes -> GiB (1024 ** 3)
+            "allocated": allocated_gpu_memory / (1024 ** 3),  # torch.cuda.memory_allocated() for the device, in GiB
+            "reserved": reserved_gpu_memory / (1024 ** 3),  # torch.cuda.memory_reserved() for the device, in GiB
+            "free": (total_gpu_memory - reserved_gpu_memory) / (1024 ** 3)  # total minus reserved, in GiB; NOTE(review): presumably ignores other processes' usage - confirm
+        }
+        return gpu_memory
+    else:
+        raise HTTPException(status_code=404, detail="GPU가 사용 가능하지 않습니다.")
\ No newline at end of file
diff --git a/ai/app/utils/file_utils.py b/ai/app/utils/file_utils.py
index 7cab7df..aa50973 100644
--- a/ai/app/utils/file_utils.py
+++ b/ai/app/utils/file_utils.py
@@ -43,7 +43,6 @@ def process_image_and_label(data:TrainDataInfo, dataset_root_path:str, child_pat
     """이미지 저장 및 레이블 파일 생성"""
     # 이미지 url로부터 파일명 분리
     img_name = data.image_url.split('/')[-1]
-
     img_path = os.path.join(dataset_root_path,child_path,img_name)
 
     # url로부터 이미지 다운로드
diff --git a/ai/environment.yml b/ai/environment.yml
index 4799f64..2161aa9 100644
--- a/ai/environment.yml
+++ b/ai/environment.yml
@@ -19,3 +19,4 @@ dependencies:
   - locust
   - websockets
   - httpx
+  - psutil

From 993b444c6ee6f2d5aed071704430386ea3002598 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=EA=B9=80=EC=9A=A9=EC=88=98?= <akys159357@naver.com>
Date: Fri, 4 Oct 2024 13:02:33 +0900
Subject: [PATCH 2/2] =?UTF-8?q?Feat:=20=EB=A9=94=EB=AA=A8=EB=A6=AC=20?=
 =?UTF-8?q?=ED=99=95=EC=9D=B8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ai/app/api/yolo/detection.py | 44 +++++++++++++++++++++++++++++++-----
 1 file changed, 38 insertions(+), 6 deletions(-)

diff --git a/ai/app/api/yolo/detection.py b/ai/app/api/yolo/detection.py
index eef3c31..b638f08 100644
--- a/ai/app/api/yolo/detection.py
+++ b/ai/app/api/yolo/detection.py
@@ -1,4 +1,5 @@
 import os
+import time
 
 import psutil
 from fastapi import APIRouter, HTTPException
@@ -111,10 +112,14 @@ async def detection_train(request: TrainRequest):
     send_slack_message(f"train 요청{request}", status="success")
 
     # 데이터셋 루트 경로 얻기 (프로젝트 id 기반)
+    
     dataset_root_path = get_dataset_root_path(request.project_id)
-
+    
     # 모델 로드
+    start_time = time.time()
+    print("모델 로드")
     model = get_model(request.project_id, request.m_key)
+    print(f'걸린 시간 {time.time() - start_time:.2f} 초')
 
     # 이 값을 학습할때 넣으면 이 카테고리들이 학습됨
     names = list(request.label_map)
@@ -125,23 +130,49 @@ async def detection_train(request: TrainRequest):
     # value : 모델에 저장될 카테고리 id (모델에는 key의 idx 순서대로 저장될 것임)
     
     # 데이터 전처리: 학습할 디렉토리 & 데이터셋 설정 파일을 생성
+    start_time = time.time()
+    print("데이터 전처리 : 학습할 디렉토리 및 데이터셋 설정 파일 생성")
     process_directories(dataset_root_path, names)
+    print(f'걸린 시간 {time.time() - start_time:.2f} 초')
 
     # 데이터 전처리: 데이터를 학습데이터와 검증데이터로 분류
+    start_time = time.time()
+    print("데이터 전처리 : 데이터 분류")
     train_data, val_data = split_data(request.data, request.ratio)
-    
+    print(f'걸린 시간 {time.time() - start_time:.2f} 초')
 
     # 데이터 전처리: 데이터 이미지 및 레이블 다운로드
+    start_time = time.time()
+    print("데이터 전처리 : 데이터 다운로드")
     download_data(train_data, val_data, dataset_root_path, label_converter)
-
+    print(f'걸린 시간 {time.time() - start_time:.2f} 초')
+    
     # 학습
+    start_time = time.time()
+    print("학습 시작")
     results = run_train(request, model,dataset_root_path)
+    print(f'걸린 시간 {time.time() - start_time:.2f} 초')
+
+    # 학습 후 GPU 메모리 상태 확인
+    if torch.cuda.is_available():
+        allocated_memory = torch.cuda.memory_allocated() / (1024 ** 2)  # MB 단위
+        reserved_memory = torch.cuda.memory_reserved() / (1024 ** 2)  # MB 단위
+        print(f"현재 할당된 GPU 메모리: {allocated_memory:.2f} MB")
+        print(f"현재 예약된 GPU 메모리: {reserved_memory:.2f} MB")
+    else:
+        print("GPU 사용 불가능")
+    torch.cuda.empty_cache()
 
     # best 모델 저장
+    start_time = time.time()
+    print("모델 저장")
     model_key = save_model(project_id=request.project_id, path=join_path(dataset_root_path, "result", "weights", "best.pt"))
-    
-    result = results.results_dict
+    print(f'걸린 시간 {time.time() - start_time:.2f} 초')
 
+    print("변환")
+    result = results.results_dict
+    print(f'걸린 시간 {time.time() - start_time:.2f} 초')
+    
     response = TrainResponse(
         modelKey=model_key,
         precision= result["metrics/precision(B)"],
@@ -152,7 +183,8 @@ async def detection_train(request: TrainRequest):
         fitness= result["fitness"]
     )
     send_slack_message(f"train 성공{response}", status="success")
-        
+
+    print(response)
     return response
 
 def split_data(data:list[TrainDataInfo], ratio:float):