# game-fe-agent/skills/requirement-analyzer/scripts/extract_pptx.py

"""
PPT 기획서 추출 스크립트 (extract_pptx.py)
========================================
PPTX 파일을 파싱하여 Claude AI가 분석할 수 있는 JSON 구조로 변환합니다.
의미 분석(화면 매핑, API 추출, 플로우 추론)은 Claude AI가 담당합니다.

사용법:
  python extract_pptx.py <pptx경로> [옵션]

옵션:
  --extract-images          슬라이드 이미지를 PNG로 추출 (--output-dir 로 저장 경로 지정)
  --output-dir <dir>        이미지 추출 디렉토리 (기본: /tmp/pptx_<파일명>/)
  --slides <범위>            처리할 슬라이드 범위 (예: 1-10, 5, 3-7,10-12)
  --auto-install            python-pptx 자동 설치 후 실행
  --pretty                  JSON 출력 시 들여쓰기 적용

출력:
  stdout 에 JSON 데이터 출력
"""
import sys
import os
import json
import argparse
import tempfile
import re


# ─────────────────────────────────────────────
# 의존성 확인 및 자동 설치
# ─────────────────────────────────────────────

def _ensure_pptx(auto_install: bool) -> None:
    """Make sure the ``python-pptx`` package can be imported.

    Args:
        auto_install: When True and the package is missing, install it with
            ``pip`` using the current interpreter. When False and the package
            is missing, print installation instructions to stderr and exit.

    Raises:
        SystemExit: With status 1 when the package is missing and
            ``auto_install`` is False.
        subprocess.CalledProcessError: When the ``pip install`` command fails.
    """
    try:
        import pptx  # noqa: F401
    except ImportError:
        pass
    else:
        return

    if not auto_install:
        print(
            "❌ python-pptx 패키지가 설치되어 있지 않습니다.\n"
            "\n"
            "설치 명령어:\n"
            "  pip3 install python-pptx\n"
            "\n"
            "또는 자동 설치 옵션을 사용하세요:\n"
            f"  python3 {sys.argv[0]} --auto-install <파일경로>",
            file=sys.stderr,
        )
        sys.exit(1)

    import importlib
    import subprocess

    print("📦 python-pptx 설치 중...", file=sys.stderr)
    # stdout carries the JSON result of this tool, so pip's own progress
    # output is redirected to stderr to keep stdout machine-readable.
    subprocess.check_call(
        [sys.executable, '-m', 'pip', 'install', 'python-pptx'],
        stdout=sys.stderr,
    )
    # The package was installed while this interpreter was already running;
    # drop cached directory listings so the import system can see it.
    importlib.invalidate_caches()
    print("✅ python-pptx 설치 완료\n", file=sys.stderr)


# ─────────────────────────────────────────────
# 슬라이드 범위 파싱
# ─────────────────────────────────────────────

def parse_slide_range(spec: str, total: int) -> set[int]:
    """Convert a slide-range expression into a set of 1-based slide numbers.

    ``spec`` is a comma-separated list of single numbers and inclusive
    ``start-end`` ranges, for example ``"1-5,8,10-12"``. Whitespace around
    items is ignored and empty items (such as a trailing comma) are skipped.

    Args:
        spec: The range expression to parse.
        total: Number of slides in the deck; numbers outside ``1..total``
            are dropped from the result.

    Returns:
        The selected slide numbers, each within ``1..total``. A range whose
        start is greater than its end contributes nothing.

    Raises:
        ValueError: If an item is neither an integer nor an
            ``integer-integer`` range.
    """
    selected: set[int] = set()
    for token in spec.split(','):
        token = token.strip()
        if not token:
            # Tolerate stray separators such as "1-3," or "1,,4".
            continue
        if '-' in token:
            first, last = (int(bound) for bound in token.split('-', 1))
            # Clamp to the deck before expanding so that a huge upper bound
            # does not build a huge intermediate set.
            selected.update(range(max(first, 1), min(last, total) + 1))
        else:
            number = int(token)
            if 1 <= number <= total:
                selected.add(number)
    return selected


# ─────────────────────────────────────────────
# 도형(Shape) 분류
# ─────────────────────────────────────────────

def _shape_type_name(shape) -> str:
    """Return the lower-cased member name of a shape's ``shape_type``.

    Args:
        shape: Object exposing a ``shape_type`` attribute whose value has a
            ``name`` (an enum member), e.g. a python-pptx shape.

    Returns:
        The member name in lower case (e.g. ``'auto_shape'``, ``'picture'``,
        ``'line'``), or ``'unknown'`` when the type cannot be read or has no
        name (for instance when ``shape_type`` is ``None`` or raises).
    """
    try:
        return shape.shape_type.name.lower()
    except Exception:
        # Best effort: an unclassifiable shape must not abort the extraction.
        return 'unknown'


def _is_connector(shape) -> bool:
    """Tell whether a shape is a line/connector (arrow) shape.

    Args:
        shape: Object exposing a ``shape_type`` attribute.

    Returns:
        True when ``shape_type`` equals 9 (MSO_SHAPE_TYPE.LINE); False
        otherwise, including when reading ``shape_type`` raises.
    """
    line_type = 9  # MSO_SHAPE_TYPE.LINE
    try:
        kind = shape.shape_type
        matched = kind in (line_type,)
    except Exception:
        matched = False
    return matched


# ─────────────────────────────────────────────
# 슬라이드 데이터 추출
# ─────────────────────────────────────────────

def extract_slide(slide, slide_number: int, extract_images: bool, output_dir: str) -> dict:
    """Collect the title, texts, tables, pictures, shapes and notes of one slide.

    Only the shapes yielded by ``slide.shapes`` are visited; children of
    group shapes are not descended into.

    Args:
        slide: Slide object exposing ``shapes``, ``has_notes_slide`` and
            ``notes_slide`` (a python-pptx slide).
        slide_number: 1-based slide number; stored under ``'number'`` and
            used as a prefix of extracted image file names.
        extract_images: When True, each picture's bytes are written to
            ``output_dir`` using the extension reported by the image itself.
        output_dir: Directory for extracted images; created on demand.

    Returns:
        A dict with the keys ``number``, ``title``, ``texts``, ``notes``,
        ``images``, ``shapes`` and ``tables``. Positions and sizes are the
        integer values of the shape's ``left``/``top``/``width``/``height``
        (0 when the shape reports none). An image that could not be written
        carries an ``'error'`` message and an empty ``'path'``.
    """
    # Integer values of the python-pptx enum members compared against below.
    title_placeholder_types = (1, 3)  # PP_PLACEHOLDER.TITLE, CENTER_TITLE
    picture_shape_type = 13           # MSO_SHAPE_TYPE.PICTURE

    result: dict = {
        'number': slide_number,
        'title': '',
        'texts': [],
        'notes': '',
        'images': [],
        'shapes': [],
        'tables': [],
    }

    # ── Title: text of the first title / centered-title placeholder ──
    for shape in slide.shapes:
        try:
            if (
                shape.is_placeholder
                and shape.placeholder_format.type in title_placeholder_types
            ):
                result['title'] = shape.text.strip()
                break
        except Exception:
            # Best effort: a shape whose placeholder data cannot be read is
            # simply not considered as the title.
            continue

    # Lower-cased file names already written for this slide. Shape names are
    # not guaranteed to be unique, and some file systems ignore case.
    used_image_names: set[str] = set()

    # ── Every shape on the slide ──
    for shape in slide.shapes:
        shape_info: dict = {
            'type': _shape_type_name(shape),
            'name': getattr(shape, 'name', ''),
            'left': int(shape.left or 0),
            'top': int(shape.top or 0),
            'width': int(shape.width or 0),
            'height': int(shape.height or 0),
            'text': '',
        }

        # Text frame: non-empty text is also listed under 'texts'.
        if shape.has_text_frame:
            text = shape.text_frame.text.strip()
            shape_info['text'] = text
            if text:
                result['texts'].append({
                    'text': text,
                    'left': shape_info['left'],
                    'top': shape_info['top'],
                    'width': shape_info['width'],
                    'height': shape_info['height'],
                    'shape_name': shape_info['name'],
                })

        # Table: the first row is reported as headers, the rest as rows.
        if shape.has_table:
            table_rows = list(shape.table.rows)
            headers = (
                [cell.text.strip() for cell in table_rows[0].cells]
                if table_rows else []
            )
            rows = [
                [cell.text.strip() for cell in row.cells]
                for row in table_rows[1:]
            ]
            result['tables'].append({'headers': headers, 'rows': rows})
            shape_info['type'] = 'table'

        # Picture. The shape_type lookup is guarded like the other
        # shape_type lookups in this file: a shape whose type cannot be
        # read is treated as "not a picture" instead of aborting the slide.
        try:
            is_picture = shape.shape_type == picture_shape_type
        except (AttributeError, NotImplementedError):
            is_picture = False

        if is_picture:
            img_info: dict = {
                'name': shape.name,
                'left': shape_info['left'],
                'top': shape_info['top'],
                'width': shape_info['width'],
                'height': shape_info['height'],
                'path': '',
            }
            if extract_images:
                try:
                    img_bytes = shape.image.blob
                    ext = shape.image.ext  # e.g. 'png', 'jpeg'
                    safe_name = re.sub(r'[^\w\-.]', '_', shape.name)
                    stem = f"slide{slide_number:03d}_{safe_name}"
                    img_filename = f"{stem}.{ext}"
                    # Add a numeric suffix only when the name was already
                    # used on this slide, so earlier pictures are kept.
                    duplicate = 1
                    while img_filename.lower() in used_image_names:
                        duplicate += 1
                        img_filename = f"{stem}_{duplicate}.{ext}"
                    used_image_names.add(img_filename.lower())
                    img_path = os.path.join(output_dir, img_filename)
                    os.makedirs(output_dir, exist_ok=True)
                    with open(img_path, 'wb') as f:
                        f.write(img_bytes)
                    img_info['path'] = img_path
                except Exception as e:
                    # One unreadable or unwritable image must not stop the
                    # extraction; the failure is reported in the output.
                    img_info['error'] = str(e)
            result['images'].append(img_info)
            shape_info['type'] = 'picture'

        # Connector / arrow: takes precedence over the labels set above.
        if _is_connector(shape):
            shape_info['type'] = 'connector'

        result['shapes'].append(shape_info)

    # ── Speaker notes (best effort; left empty when unreadable) ──
    try:
        if slide.has_notes_slide:
            notes_text = slide.notes_slide.notes_text_frame.text.strip()
            result['notes'] = notes_text
    except Exception:
        pass

    return result


# ─────────────────────────────────────────────
# 메인 추출 함수
# ─────────────────────────────────────────────

def extract_pptx(
    filepath: str,
    extract_images: bool = False,
    output_dir: str = '',
    slide_range: str = '',
    pretty: bool = False,
) -> None:
    """Parse a PPTX file and print its content as JSON on stdout.

    Progress and error messages are written to stderr so that stdout holds
    only the JSON document.

    Args:
        filepath: Absolute or relative path of the PPTX file.
        extract_images: When True, pictures found on slides are written to
            ``output_dir`` in the image format they are stored in.
        output_dir: Image output directory. When empty, a ``pptx_<name>``
            directory inside the system temporary directory is used, where
            ``<name>`` is the sanitised file name without extension.
        slide_range: Slide selection such as ``"1-10"`` or ``"3-7,10-12"``;
            an empty string selects every slide.
        pretty: When True, the JSON output is indented by two spaces.

    Raises:
        SystemExit: With status 1 when ``filepath`` is not an existing file
            or ``slide_range`` cannot be parsed.
    """
    # Validate the input first; a directory is rejected as well because it
    # cannot be opened as a presentation.
    if not os.path.isfile(filepath):
        print(f"❌ 파일을 찾을 수 없습니다: {filepath}", file=sys.stderr)
        sys.exit(1)

    # Imported here rather than at module level so the script can start (and
    # offer --auto-install) when python-pptx is not installed yet.
    from pptx import Presentation

    prs = Presentation(filepath)
    total_slides = len(prs.slides)
    filename = os.path.basename(filepath)

    # Decide the image output directory.
    if not output_dir:
        stem = re.sub(r'[^\w\-]', '_', os.path.splitext(filename)[0])
        output_dir = os.path.join(tempfile.gettempdir(), f'pptx_{stem}')

    # Decide which slide numbers to process.
    if slide_range:
        try:
            target_slides = parse_slide_range(slide_range, total_slides)
        except ValueError:
            # Report a malformed range as a usage error, not a traceback.
            print(
                f"❌ 슬라이드 범위 형식이 올바르지 않습니다: {slide_range} (예: 1-10, 5, 3-7,10-12)",
                file=sys.stderr,
            )
            sys.exit(1)
    else:
        target_slides = set(range(1, total_slides + 1))

    print(
        f"🔍 파싱 중: {filename} ({total_slides}장 중 {len(target_slides)}장 처리)",
        file=sys.stderr,
    )
    if extract_images:
        print(f"🖼️  이미지 추출 디렉토리: {output_dir}", file=sys.stderr)

    # Extract the selected slides in presentation order.
    slides_data = []
    for idx, slide in enumerate(prs.slides, start=1):
        if idx not in target_slides:
            continue
        slide_data = extract_slide(slide, idx, extract_images, output_dir)
        slides_data.append(slide_data)
        print(f"  슬라이드 {idx}/{total_slides}: {slide_data['title'] or '(제목 없음)'}", file=sys.stderr)

    output = {
        'filename': filename,
        'filepath': os.path.abspath(filepath),
        'total_slides': total_slides,
        'processed_slides': len(slides_data),
        'image_output_dir': output_dir if extract_images else '',
        'slides': slides_data,
    }

    indent = 2 if pretty else None
    print(json.dumps(output, ensure_ascii=False, indent=indent))
    print(f"\n✅ 추출 완료: {len(slides_data)}개 슬라이드", file=sys.stderr)


# ─────────────────────────────────────────────
# CLI 진입점
# ─────────────────────────────────────────────

def main() -> None:
    """Command-line entry point: parse the arguments and run the extraction.

    The file path is checked before the python-pptx dependency, so calling
    the script without a path never triggers an installation. Usage help on
    that error path goes to stderr because stdout is reserved for JSON.

    Raises:
        SystemExit: With status 1 when no file path was given (argparse
            itself exits with its own status on unparsable arguments).
    """
    parser = argparse.ArgumentParser(
        description='PPTX 기획서를 JSON 구조로 추출합니다.',
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('filepath', nargs='?', help='PPTX 파일 경로')
    # Pictures are saved in the format they are stored in, not converted.
    parser.add_argument('--extract-images', action='store_true', help='슬라이드에 포함된 이미지를 원본 형식으로 추출')
    parser.add_argument('--output-dir', default='', help='이미지 추출 디렉토리')
    parser.add_argument('--slides', default='', help='처리할 슬라이드 범위 (예: 1-10, 5, 3-7,10)')
    parser.add_argument('--auto-install', action='store_true', help='python-pptx 자동 설치')
    parser.add_argument('--pretty', action='store_true', help='JSON 들여쓰기 출력')
    args = parser.parse_args()

    if not args.filepath:
        parser.print_help(sys.stderr)
        print("\n❌ PPTX 파일 경로를 입력해 주세요.", file=sys.stderr)
        sys.exit(1)

    _ensure_pptx(args.auto_install)

    extract_pptx(
        filepath=args.filepath,
        extract_images=args.extract_images,
        output_dir=args.output_dir,
        slide_range=args.slides,
        pretty=args.pretty,
    )


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()