음성 처리 통합

개요

Clova Studio의 STT (Speech-to-Text)와 TTS (Text-to-Speech) API를 사용하여 음성 기반 애플리케이션을 구축하는 방법을 다룹니다.

음성-텍스트 변환 (STT)

기본 STT 구현

import requests
from typing import Optional

class ClovaSTT:
    """Client for the Clova Studio speech-to-text (STT) endpoint."""

    def __init__(self, api_key: str, timeout: float = 30.0):
        """
        Store the credentials and connection settings.

        Args:
            api_key: API key sent as a Bearer token on every request.
            timeout: Seconds to wait for the server before the request is
                aborted. Without it, a stalled connection blocks forever.
        """
        self.api_key = api_key
        self.base_url = "https://api.clovastudio.go.kr/v1"
        self.timeout = timeout

    def _post_audio(self, audio_file_path: str, data: dict) -> dict:
        """
        Upload an audio file to the STT endpoint and return the decoded JSON.

        Args:
            audio_file_path: Path of the audio file to upload.
            data: Extra form fields sent alongside the file.

        Returns:
            The JSON body of the response as a dict.

        Raises:
            OSError: If the audio file cannot be opened.
            requests.HTTPError: If the server answers with an error status.
        """
        headers = {"Authorization": f"Bearer {self.api_key}"}
        with open(audio_file_path, 'rb') as audio_file:
            response = requests.post(
                f"{self.base_url}/tools/stt",
                headers=headers,
                files={'file': audio_file},
                data=data,
                timeout=self.timeout,
            )
        response.raise_for_status()
        return response.json()

    def transcribe(self, audio_file_path: str, language: str = "ko-KR") -> str:
        """
        Convert an audio file to text.

        Args:
            audio_file_path: Path of the audio file (wav, mp3, m4a, ...).
            language: Language code (ko-KR, en-US, ...).

        Returns:
            The value of the ``text`` field of the response.
        """
        result = self._post_audio(audio_file_path, {'language': language})
        return result['text']

    def transcribe_with_timestamps(self, audio_file_path: str,
                                   language: str = "ko-KR") -> dict:
        """
        Convert an audio file to text, requesting timestamps as well.

        Args:
            audio_file_path: Path of the audio file to upload.
            language: Language code (ko-KR, en-US, ...).

        Returns:
            The full JSON response as a dict.
        """
        return self._post_audio(
            audio_file_path,
            {
                'language': language,
                'include_timestamps': True,
            },
        )

# Usage example: transcribe a local Korean audio file and print the result.
stt = ClovaSTT(api_key="YOUR_API_KEY")
text = stt.transcribe(audio_file_path="./audio.wav", language="ko-KR")
print(f"변환된 텍스트: {text}")

텍스트-음성 변환 (TTS)

기본 TTS 구현

class ClovaTTS:
    """Client for the Clova Studio text-to-speech (TTS) endpoints."""

    def __init__(self, api_key: str, timeout: float = 30.0):
        """
        Store the credentials and connection settings.

        Args:
            api_key: API key sent as a Bearer token on every request.
            timeout: Seconds to wait for the server before the request is
                aborted. Without it, a stalled connection blocks forever.
        """
        self.api_key = api_key
        self.base_url = "https://api.clovastudio.go.kr/v1"
        self.timeout = timeout
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

    def synthesize(self, text: str, output_path: str,
                  voice: str = "nara",
                  speed: float = 1.0,
                  pitch: float = 1.0) -> str:
        """
        Convert text to speech and save the audio to a file.

        Args:
            text: Text to synthesize.
            output_path: Path the audio bytes are written to.
            voice: Voice name (nara, jinho, ...).
            speed: Speaking speed (0.5~2.0).
            pitch: Pitch (0.5~2.0).

        Returns:
            ``output_path``, unchanged.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        payload = {
            "text": text,
            "voice": voice,
            "speed": speed,
            "pitch": pitch,
            "format": "wav"
        }

        response = requests.post(
            f"{self.base_url}/tools/tts",
            headers=self.headers,
            json=payload,
            timeout=self.timeout,
        )
        response.raise_for_status()

        # The response body is written to disk as-is.
        with open(output_path, 'wb') as f:
            f.write(response.content)

        return output_path

    def get_available_voices(self) -> list:
        """
        Fetch the list of available voices.

        Returns:
            The value of the ``voices`` field of the response.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        response = requests.get(
            f"{self.base_url}/tools/tts/voices",
            headers=self.headers,
            timeout=self.timeout,
        )
        response.raise_for_status()

        return response.json()['voices']

# Usage example: synthesize a short Korean greeting into ./output.wav.
tts = ClovaTTS(api_key="YOUR_API_KEY")
tts.synthesize(
    "안녕하세요. Clova Studio입니다.",
    "./output.wav",
    voice="nara",
    speed=1.0,
)

음성 챗봇 구축

STT, Chat, TTS를 결합한 완전한 음성 챗봇

class VoiceChatBot:
    """Voice chatbot that chains STT, a chat completion, and TTS."""

    def __init__(self, api_key: str, timeout: float = 30.0):
        """
        Create the STT/TTS clients and an empty conversation history.

        Args:
            api_key: API key used for all three services.
            timeout: Seconds to wait for the chat completion request.
        """
        self.stt = ClovaSTT(api_key)
        self.tts = ClovaTTS(api_key)
        self.api_key = api_key
        self.base_url = "https://api.clovastudio.go.kr/v1"
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }
        self.timeout = timeout
        self.conversation_history = []

    def process_voice_input(self, audio_file_path: str,
                           output_audio_path: str) -> tuple[str, str, str]:
        """
        Turn a spoken question into a spoken answer.

        Args:
            audio_file_path: Input audio file.
            output_audio_path: Path the synthesized answer is written to.

        Returns:
            (recognized text, AI response text, output audio file path)

        Raises:
            requests.HTTPError: If the chat request returns an error status.
        """
        print("🎤 음성 인식 중...")
        # 1. STT: convert the audio to text.
        user_text = self.stt.transcribe(audio_file_path)
        print(f"📝 인식됨: {user_text}")

        print("🤔 응답 생성 중...")
        # 2. Chat: generate the text reply.
        # The new user turn is sent with the request but only stored in the
        # history once a reply arrives. Otherwise a failed request would leave
        # a user message without an answer and skew every later call.
        user_message = {
            "role": "user",
            "content": user_text
        }

        response = requests.post(
            f"{self.base_url}/chat/completions",
            headers=self.headers,
            json={
                "model": "clova-x",
                "messages": self.conversation_history + [user_message]
            },
            timeout=self.timeout,
        )
        response.raise_for_status()

        ai_response = response.json()['choices'][0]['message']['content']
        print(f"💬 AI 응답: {ai_response}")

        self.conversation_history.extend([
            user_message,
            {
                "role": "assistant",
                "content": ai_response
            },
        ])

        print("🔊 음성 합성 중...")
        # 3. TTS: convert the reply to audio.
        self.tts.synthesize(ai_response, output_audio_path)
        print(f"✅ 음성 생성 완료: {output_audio_path}")

        return user_text, ai_response, output_audio_path

    def reset_conversation(self):
        """Discard the conversation history."""
        self.conversation_history = []

# Usage example: answer one recorded question and report the three results.
bot = VoiceChatBot(api_key="YOUR_API_KEY")

user_text, ai_text, audio_path = bot.process_voice_input(
    "./user_question.wav",
    "./ai_response.wav",
)

print(f"\n사용자: {user_text}")
print(f"AI: {ai_text}")
print(f"응답 음성: {audio_path}")

실시간 스트리밍 음성 처리

import pyaudio
import wave
import threading

class RealtimeVoiceChat:
    """Interactive microphone/speaker front end for VoiceChatBot."""

    def __init__(self, api_key: str):
        """
        Create the chatbot and the PyAudio handle.

        Args:
            api_key: API key passed to VoiceChatBot.
        """
        self.bot = VoiceChatBot(api_key)
        self.is_recording = False
        self.audio = pyaudio.PyAudio()

    def record_audio(self, output_path: str, duration: int = 5) -> str:
        """
        Record audio from the microphone and save it as a WAV file.

        Args:
            output_path: Path of the WAV file to write.
            duration: Recording length in seconds.

        Returns:
            ``output_path``, unchanged.
        """
        CHUNK = 1024
        FORMAT = pyaudio.paInt16
        CHANNELS = 1
        RATE = 16000

        stream = self.audio.open(
            format=FORMAT,
            channels=CHANNELS,
            rate=RATE,
            input=True,
            frames_per_buffer=CHUNK
        )

        # Release the input stream even if reading from the device fails.
        try:
            print("🎙️ 녹음 중... ({}초)".format(duration))
            frames = [
                stream.read(CHUNK)
                for _ in range(int(RATE / CHUNK * duration))
            ]
            print("✅ 녹음 완료")
        finally:
            stream.stop_stream()
            stream.close()

        # Save as a WAV file; the context manager closes it on any exit path.
        with wave.open(output_path, 'wb') as wf:
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(self.audio.get_sample_size(FORMAT))
            wf.setframerate(RATE)
            wf.writeframes(b''.join(frames))

        return output_path

    def play_audio(self, file_path: str):
        """
        Play a WAV file through the default output device.

        Args:
            file_path: Path of the WAV file to play.
        """
        with wave.open(file_path, 'rb') as wf:
            stream = self.audio.open(
                format=self.audio.get_format_from_width(wf.getsampwidth()),
                channels=wf.getnchannels(),
                rate=wf.getframerate(),
                output=True
            )

            # Release the output stream even if playback fails midway.
            try:
                print("🔊 재생 중...")
                data = wf.readframes(1024)
                while data:
                    stream.write(data)
                    data = wf.readframes(1024)
            finally:
                stream.stop_stream()
                stream.close()
        print("✅ 재생 완료")

    def interactive_session(self):
        """
        Run the record -> process -> play loop until the user enters 'q'.

        Processing and playback errors are printed and the loop continues.
        The PyAudio handle is terminated when the loop ends for any reason.
        """
        print("🎤 음성 챗봇 시작! (종료하려면 'q' 입력)")

        session_count = 0
        try:
            while True:
                command = input("\n[Enter]를 눌러 말하기 시작 ('q'=종료): ")

                if command.lower() == 'q':
                    break

                # Record. File names are numbered by successful turn, so a
                # failed turn's files are overwritten by the next attempt.
                input_file = f"./temp_input_{session_count}.wav"
                output_file = f"./temp_output_{session_count}.wav"

                self.record_audio(input_file, duration=5)

                # Process
                try:
                    user_text, ai_text, audio_path = self.bot.process_voice_input(
                        input_file, output_file
                    )

                    # Play the reply
                    self.play_audio(audio_path)

                    session_count += 1

                except Exception as e:
                    print(f"❌ 오류 발생: {e}")
        finally:
            # Runs on normal exit, on Ctrl+C, and on a recording failure.
            self.audio.terminate()
        print("👋 음성 챗봇 종료")

# Usage example: start the interactive session only when run as a script.
if __name__ == "__main__":
    chat = RealtimeVoiceChat(api_key="YOUR_API_KEY")
    chat.interactive_session()

음성 기반 RAG 시스템

음성으로 문서 검색 및 질의응답

class VoiceRAGSystem:
    """Voice front end for document search plus question answering."""

    def __init__(self, api_key: str, collection_id: str, timeout: float = 30.0):
        """
        Create the STT/TTS clients and store the search settings.

        Args:
            api_key: API key used for every request.
            collection_id: Identifier of the document collection to search.
            timeout: Seconds to wait for the search and chat requests.
        """
        self.stt = ClovaSTT(api_key)
        self.tts = ClovaTTS(api_key)
        self.api_key = api_key
        self.base_url = "https://api.clovastudio.go.kr/v1"
        self.collection_id = collection_id
        self.timeout = timeout
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

    def voice_query(self, audio_file: str, output_audio: str) -> dict:
        """
        Answer a spoken question from retrieved documents, as audio.

        Args:
            audio_file: Audio file containing the question.
            output_audio: Path the synthesized answer is written to.

        Returns:
            Dict with ``query``, ``answer``, ``sources`` (the search results)
            and ``audio_path``.

        Raises:
            requests.HTTPError: If the search or chat request returns an
                error status.
        """
        # 1. Speech to text
        query = self.stt.transcribe(audio_file)
        print(f"🔍 질문: {query}")

        # 2. Document search
        # NOTE(review): endpoint path kept exactly as written in the original;
        # confirm it against the API reference.
        search_response = requests.post(
            f"{self.base_url}/rag42/search",
            headers=self.headers,
            json={
                "collection_id": self.collection_id,
                "query": query,
                "top_k": 3
            },
            timeout=self.timeout,
        )
        # Fail on HTTP errors here rather than with a KeyError on 'results'.
        search_response.raise_for_status()
        docs = search_response.json()['results']

        # 3. Answer generation
        context = "\n\n".join(doc['content'] for doc in docs)

        chat_response = requests.post(
            f"{self.base_url}/chat/completions",
            headers=self.headers,
            json={
                "model": "clova-x",
                "messages": [
                    {
                        "role": "system",
                        "content": "제공된 문서를 바탕으로 정확하게 답변하세요."
                    },
                    {
                        "role": "user",
                        "content": f"문서:\n{context}\n\n질문: {query}"
                    }
                ]
            },
            timeout=self.timeout,
        )
        chat_response.raise_for_status()

        answer = chat_response.json()['choices'][0]['message']['content']
        print(f"💬 답변: {answer}")

        # 4. Answer to speech
        self.tts.synthesize(answer, output_audio)

        return {
            "query": query,
            "answer": answer,
            "sources": docs,
            "audio_path": output_audio
        }

# Usage example: ask a recorded question against one document collection.
rag_voice = VoiceRAGSystem(
    api_key="YOUR_API_KEY",
    collection_id="collection_id_here",
)
result = rag_voice.voice_query("./question.wav", "./answer.wav")

다국어 음성 처리

class MultilingualVoiceBot:
    """Voice bot that transcribes speech, translates it, and speaks the result."""

    def __init__(self, api_key: str, timeout: float = 30.0):
        """
        Create the STT/TTS clients and store the request settings.

        Args:
            api_key: API key used for every request.
            timeout: Seconds to wait for the translation request.
        """
        self.stt = ClovaSTT(api_key)
        self.tts = ClovaTTS(api_key)
        self.api_key = api_key
        self.base_url = "https://api.clovastudio.go.kr/v1"
        self.timeout = timeout
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

    def detect_language(self, text: str) -> str:
        """
        Guess the language of a text.

        Returns "ko" if the text contains at least one character in the
        range U+AC00..U+D7A3, otherwise "en". This is a deliberately simple
        heuristic; real use needs a more careful detector.
        """
        if any('\uac00' <= char <= '\ud7a3' for char in text):
            return "ko"
        return "en"

    def _translate(self, text: str, target_lang: str) -> str:
        """
        Translate ``text`` into ``target_lang``.

        NOTE(review): the original example called this method without
        defining it. This version asks the chat completions endpoint used
        elsewhere in this file to translate; that the model translates
        reliably is an assumption. Replace with a dedicated translation
        service if one is available.

        Args:
            text: Text to translate.
            target_lang: Language code of the desired output (e.g. "ko", "en").

        Returns:
            The content of the first choice in the chat response.

        Raises:
            requests.HTTPError: If the request returns an error status.
        """
        response = requests.post(
            f"{self.base_url}/chat/completions",
            headers=self.headers,
            json={
                "model": "clova-x",
                "messages": [
                    {
                        "role": "system",
                        "content": (
                            "Translate the user's message into the language "
                            f"with code '{target_lang}'. "
                            "Reply with the translation only."
                        )
                    },
                    {
                        "role": "user",
                        "content": text
                    }
                ]
            },
            timeout=self.timeout,
        )
        response.raise_for_status()
        return response.json()['choices'][0]['message']['content']

    def translate_and_respond(self, audio_file: str, target_lang: str,
                             output_audio: str) -> dict:
        """
        Transcribe speech, translate it if needed, and synthesize the result.

        Args:
            audio_file: Input audio file.
            target_lang: Language code of the output ("ko" selects the
                "nara" voice, anything else selects "clara").
            output_audio: Path the synthesized audio is written to.

        Returns:
            Dict with ``source_lang``, ``target_lang``, ``source_text`` and
            ``translated_text``.
        """
        # 1. Speech to text
        source_text = self.stt.transcribe(audio_file)
        source_lang = self.detect_language(source_text)

        # 2. Translate only when the detected language differs from the target
        if source_lang != target_lang:
            translated_text = self._translate(source_text, target_lang)
        else:
            translated_text = source_text

        # 3. Generate a response
        # ... (Chat API call omitted in this example)

        # 4. Text to speech
        voice = "nara" if target_lang == "ko" else "clara"
        self.tts.synthesize(translated_text, output_audio, voice=voice)

        return {
            "source_lang": source_lang,
            "target_lang": target_lang,
            "source_text": source_text,
            "translated_text": translated_text
        }

베스트 프랙티스

오디오 품질

16kHz 샘플링 레이트 사용
노이즈 캔슬링 적용
적절한 볼륨 레벨 유지

에러 처리

네트워크 오류 시 재시도
음성 인식 실패 시 사용자에게 재요청
타임아웃 설정

성능 최적화

오디오 파일 압축
캐싱 활용
비동기 처리

다음 단계

Chat 애플리케이션

텍스트 기반 챗봇 구축

STT API

STT API 상세 문서

TTS API

TTS API 상세 문서

Cookbook

웹 활용 예제

API 활용 예제

개요

음성-텍스트 변환 (STT)

기본 STT 구현

텍스트-음성 변환 (TTS)

기본 TTS 구현

음성 챗봇 구축

실시간 스트리밍 음성 처리

음성 기반 RAG 시스템

다국어 음성 처리

베스트 프랙티스

다음 단계

Chat 애플리케이션

STT API

TTS API

Cookbook

웹 활용 예제

API 활용 예제

개요

음성-텍스트 변환 (STT)

기본 STT 구현

텍스트-음성 변환 (TTS)

기본 TTS 구현

음성 챗봇 구축

실시간 스트리밍 음성 처리

음성 기반 RAG 시스템

다국어 음성 처리

베스트 프랙티스

다음 단계

Chat 애플리케이션

STT API

TTS API

개요

음성-텍스트 변환 (STT)

기본 STT 구현

텍스트-음성 변환 (TTS)

기본 TTS 구현

음성 챗봇 구축

실시간 스트리밍 음성 처리

음성 기반 RAG 시스템

다국어 음성 처리

베스트 프랙티스

다음 단계