Making a language model imitate a character is called persona tuning. I’m going to see if I can fine-tune an sLM to imitate a character I like and have a real-time conversation with it.
Crawling Conversation Data From Gameplay Footage
- I don’t want to build the fine-tuning dataset by hand, so I will extract raw dialogue text from recorded gameplay footage and ask an LLM to finish the incomplete parts.
We need to use PaddleOCR's v4 models, since Korean isn't supported in v5 yet.
# pyproject.toml
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "game-text-extractor"
version = "0.1.0"
description = "Video text extraction tool using PaddleOCR"
requires-python = ">=3.11"
dependencies = [
"opencv-python>=4.8.0",
"paddlepaddle==3.0.0",
"paddleocr<3.0.0",
"numpy==2.3.1"
]
[project.scripts]
extract-game-text = "game_text_extractor.extract_game_text:main"
[tool.setuptools.packages.find]
where = ["."]
include = ["game_text_extractor*"]
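With this configuration, `pip install -e .` pulls in the dependencies and registers the `extract-game-text` console command, which the `[project.scripts]` table wires to a `main()` function in `game_text_extractor/extract_game_text.py` (the script below). Note that `numpy==2.3.1` only supports Python 3.11+, which is why `requires-python` is bounded accordingly.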
"""Extracts text from video frames in given interval using PaddleOCR.
Usage example:
python extract_game_text.py --video_path ./ep1-2.mp4 --output_dir ./output --frame_interval 5
"""
import cv2
import os
from paddleocr import PaddleOCR
import argparse
import json
def parse_args():
parser = argparse.ArgumentParser(description='Video Text Extractor')
parser.add_argument('--video_path', type=str, required=True, help='Path to the video file')
parser.add_argument('--output_dir', type=str, default='extracted_frames', help='Output directory for extracted frames')
parser.add_argument('--frame_interval', type=int, default=5, help='Frame interval for text extraction')
return parser.parse_args()
class VideoTextExtractor:
def __init__(self, confidence_threshold=0.8):
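        # The paddleocr<3.0.0 pin keeps this constructor signature and the
        # ocr(..., cls=True) call below available; the 3.x release reworked this API.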
self.ocr = PaddleOCR(lang='korean', show_log=False)
self.confidence_threshold = confidence_threshold
def extract_frames_and_text(self, video_path, frame_interval, output_dir):
"""영상에서 텍스트 추출"""
os.makedirs(output_dir, exist_ok=True)
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
print(f"Error: 비디오 파일을 열 수 없습니다: {video_path}")
return []
        # Video metadata
        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        duration = total_frames / fps
        print("Video info:")
        print(f"- FPS: {fps}")
        print(f"- Total frames: {total_frames}")
        print(f"- Duration: {duration:.2f}s")
        print(f"- Extracting text every {frame_interval} frames")
extracted_texts = []
frame_count = 0
saved_count = 0
while True:
ret, frame = cap.read()
if not ret:
break
if frame_count % frame_interval == 0:
timestamp = frame_count / fps
frame_filename = f"frame_{saved_count:06d}_t{timestamp:.2f}s.jpg"
frame_path = os.path.join(output_dir, frame_filename)
cv2.imwrite(frame_path, frame)
try:
result = self.ocr.ocr(frame_path, cls=True)
frame_texts = []
                    if result[0]:
                        for line in result[0]:
                            text = line[1][0]
                            confidence = line[1][1]
                            # Keep only confident detections, as text/score pairs
                            # (the print loop below expects dicts, not bare strings).
                            if confidence >= self.confidence_threshold:
                                frame_texts.append({'text': text, 'confidence': confidence})
                    # Record the timestamp so dialogue lines can be ordered
                    # and deduplicated later.
                    extracted_texts.append({
                        'timestamp': timestamp,
                        'texts': frame_texts,
                    })
                    if frame_texts:
                        print(f"Frame {saved_count} ({timestamp:.2f}s): found {len(frame_texts)} texts")
                        for text_info in frame_texts:
                            print(f"  - {text_info['text']} (confidence: {text_info['confidence']:.2f})")
                    else:
                        print(f"Frame {saved_count} ({timestamp:.2f}s): no text")
                except Exception as e:
                    print(f"OCR error (Frame {saved_count}): {e}")
                # Increment outside the try block so a failed OCR pass
                # doesn't reuse a filename and overwrite the saved frame.
                saved_count += 1
            frame_count += 1
cap.release()
result_file = os.path.join(output_dir, "extraction_results.json")
with open(result_file, 'w', encoding='utf-8') as f:
json.dump(extracted_texts, f, ensure_ascii=False, indent=2)
print(f"\n추출 완료!")
print(f"- 총 {saved_count}개 프레임 처리")
print(f"- 결과 저장 위치: {result_file}")
return extracted_texts
if __name__ == "__main__":
args = parse_args()
extractor = VideoTextExtractor(confidence_threshold=0.8)
results = extractor.extract_frames_and_text(**vars(args))
This script samples frames from the video at the given interval and has PaddleOCR extract the text from each saved frame.
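Because a dialogue box stays on screen for many frames, consecutive samples mostly repeat the same line, and OCR sometimes returns fragments. Below is a minimal sketch of the "ask an LLM to finish the incomplete parts" step, assuming the OpenAI Python client; the model name, prompt, and output path are placeholders, not part of the extractor above.
# clean_dialogue.py - hypothetical cleanup pass over extraction_results.json.
# Assumes the OpenAI Python client; model, prompt, and path are placeholders.
import json
from openai import OpenAI

with open("output/extraction_results.json", encoding="utf-8") as f:
    frames = json.load(f)

# Collapse consecutive duplicates: a subtitle shown for two seconds
# appears in dozens of sampled frames.
lines = []
for frame in frames:
    for item in frame["texts"]:
        if not lines or lines[-1] != item["text"]:
            lines.append(item["text"])

client = OpenAI()  # reads OPENAI_API_KEY from the environment
response = client.chat.completions.create(
    model="gpt-4o-mini",  # placeholder; any capable model works
    messages=[
        {"role": "system",
         "content": "You reconstruct game dialogue from noisy OCR lines. "
                    "Merge fragments and fix obvious recognition errors."},
        {"role": "user", "content": "\n".join(lines)},
    ],
)
print(response.choices[0].message.content)
The deduplicated, LLM-repaired lines are what will actually go into the fine-tuning dataset.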