From 418e814d17029c023e7a5dd2ff568d553ff47aa8 Mon Sep 17 00:00:00 2001 From: staru09 Date: Fri, 11 Jul 2025 22:59:28 +0530 Subject: [PATCH 1/7] simpler LLM operations --- .gitignore | 6 ++++- app/config.py | 16 +++++++++++++ app/services/llm_service.py | 47 +++++++++++++++++++++++++++++++++++++ app/transcript.py | 33 +++++++++++++++++++++++++- app/transcription.py | 33 ++++++++++++++++++++++++++ config.ini.example | 5 ++++ requirements.txt | 2 +- routes/transcription.py | 6 +++++ transcriber.py | 30 +++++++++++++++++++++-- 9 files changed, 173 insertions(+), 5 deletions(-) create mode 100644 app/services/llm_service.py diff --git a/.gitignore b/.gitignore index c24424b..5799ee3 100644 --- a/.gitignore +++ b/.gitignore @@ -133,4 +133,8 @@ dmypy.json # Pyre type checker .pyre/ .username -.idea \ No newline at end of file +.idea +output/ +test_feats/ +logs/ +metadata/ \ No newline at end of file diff --git a/app/config.py b/app/config.py index 576c077..3198241 100644 --- a/app/config.py +++ b/app/config.py @@ -81,6 +81,22 @@ def GITHUB_INSTALLATION_ID(self): return self._get_env_variable('GITHUB_INSTALLATION_ID', "To use GitHub App integration, you need to define a 'GITHUB_INSTALLATION_ID' in your .env file") + @property + def LLM_PROVIDER(self): + return os.getenv('LLM_PROVIDER', 'openai') + + @property + def OPENAI_API_KEY(self): + return self._get_env_variable('OPENAI_API_KEY') + + @property + def GOOGLE_API_KEY(self): + return self._get_env_variable('GOOGLE_API_KEY') + + @property + def CLAUDE_API_KEY(self): + return self._get_env_variable('CLAUDE_API_KEY') + # Initialize the Settings class and expose an instance settings = Settings() diff --git a/app/services/llm_service.py b/app/services/llm_service.py new file mode 100644 index 0000000..797c3a1 --- /dev/null +++ b/app/services/llm_service.py @@ -0,0 +1,47 @@ +from abc import ABC, abstractmethod +import openai +from app.config import settings + +class BaseLLM(ABC): + @abstractmethod + def correct_transcript(self, transcript_text: str, model: str) -> str: + pass + + @abstractmethod + def summarize_text(self, text: str, model: str) -> str: + pass + +class OpenAI_LLM(BaseLLM): + def __init__(self, api_key): + openai.api_key = api_key + + def correct_transcript(self, transcript_text: str, model: str) -> str: + prompt = f"""Please correct the following transcript for punctuation, grammar, and spelling. Do not change the content. + --- + {transcript_text}""" + + response = openai.chat.completions.create( + model=model, + messages=[{"role": "user", "content": prompt}] + ) + return response.choices[0].message.content + + def summarize_text(self, text: str, model: str) -> str: + prompt = f"""Please summarize the following text. + --- + {text}""" + + response = openai.chat.completions.create( + model=model, + messages=[{"role": "user", "content": prompt}] + ) + return response.choices[0].message.content + +class LLMFactory: + @staticmethod + def create_llm_service(provider: str) -> BaseLLM: + if provider == "openai": + return OpenAI_LLM(api_key=settings.OPENAI_API_KEY) + # Add other providers here + else: + raise ValueError(f"Unsupported LLM provider: {provider}") diff --git a/app/transcript.py b/app/transcript.py index f29f08e..46e1efe 100644 --- a/app/transcript.py +++ b/app/transcript.py @@ -72,6 +72,10 @@ def title(self): def summary(self): return self.source.summary + @summary.setter + def summary(self, value): + self.source.summary = value + def __str__(self): excluded_fields = ['test_mode', 'logger'] fields = {key: value for key, value in self.__dict__.items() @@ -339,6 +343,33 @@ def download_video_metadata(self): def process(self, working_dir): """Process video""" + + def download_video(): + """Helper method to download a YouTube video and return its absolute path""" + # sanity checks + if self.local: + raise Exception(f"{self.source_file} is a local file") + try: + self.logger.debug(f"Downloading video: {self.source_file}") + ydl_opts = { + "format": 'worstvideo+worstaudio/worst', + "outtmpl": os.path.join(working_dir, "videoFile.%(ext)s"), + "nopart": True, + } + with yt_dlp.YoutubeDL(ydl_opts) as ytdl: + ytdl.download([self.source_file]) + + for ext in ["mp4", "mkv", "webm"]: + output_file = os.path.join(working_dir, f"videoFile.{ext}") + if os.path.exists(output_file): + return os.path.abspath(output_file) + raise Exception("Downloaded file not found in expected formats.") + + return os.path.abspath(output_file) + except Exception as e: + self.logger.error(e) + raise Exception(f"Error downloading video: {e}") + try: self.logger.debug(f"Video processing: '{self.source_file}'") media_processor = MediaProcessor() @@ -426,4 +457,4 @@ def __config_source(self): self.entries.append(source) else: self.logger.warning( - f"Invalid source for '{entry.title}'. '{enclosure.type}' not supported for RSS feeds, source skipped.") + f"Invalid source for '{entry.title}'. '{enclosure.type}' not supported for RSS feeds, source skipped.") \ No newline at end of file diff --git a/app/transcription.py b/app/transcription.py index 2c7004f..26288bc 100644 --- a/app/transcription.py +++ b/app/transcription.py @@ -14,6 +14,7 @@ from app.data_fetcher import DataFetcher from app.github_api_handler import GitHubAPIHandler from app.exporters import ExporterFactory, TranscriptExporter +from app.services.llm_service import LLMFactory class Transcription: @@ -36,6 +37,11 @@ def __init__( batch_preprocessing_output=False, needs_review=False, include_metadata=True, + correct=False, + summarize_llm=False, + llm_provider="openai", + llm_correction_model="gpt-4o-mini", + llm_summary_model="gpt-4o", ): self.nocleanup = nocleanup self.status = "idle" # Can be "idle", "in_progress", or "completed" @@ -88,6 +94,17 @@ def __init__( self.preprocessing_output = [] if batch_preprocessing_output else None self.data_fetcher = DataFetcher(settings.BTC_TRANSCRIPTS_URL) + self.correct = correct + self.summarize_llm = summarize_llm + self.llm_provider = llm_provider + self.llm_correction_model = llm_correction_model + self.llm_summary_model = llm_summary_model + + if self.correct or self.summarize_llm: + self.llm_service = LLMFactory.create_llm_service(self.llm_provider) + else: + self.llm_service = None + self.logger.debug(f"Temp directory: {self.tmp_dir}") def _create_subdirectory(self, subdir_name): @@ -485,6 +502,22 @@ def postprocess(self, transcript: Transcript) -> None: with the existing code. """ try: + # --- NEW LLM REFINEMENT STEP --- + if self.llm_service: + if self.correct and transcript.outputs["raw"]: + self.logger.info(f"Correcting transcript with {self.llm_provider}...") + corrected_text = self.llm_service.correct_transcript( + transcript.outputs["raw"], self.llm_correction_model + ) + transcript.outputs["raw"] = corrected_text # Overwrite with corrected version + + if self.summarize_llm and transcript.outputs["raw"]: + self.logger.info(f"Summarizing transcript with {self.llm_provider}...") + summary = self.llm_service.summarize_text( + transcript.outputs["raw"], self.llm_summary_model + ) + transcript.summary = summary # Overwrite summary + # Handle markdown output if self.markdown or self.github_handler: transcript.outputs["markdown"] = self.write_to_markdown_file( diff --git a/config.ini.example b/config.ini.example index a3244bb..d215d70 100644 --- a/config.ini.example +++ b/config.ini.example @@ -11,3 +11,8 @@ one_sentence_per_line = True verbose_logging = True server_mode = dev server_verbose = True + +[llm] +llm_provider = openai +llm_correction_model = gpt-4o +llm_summary_model = gpt-3.5 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 97bd365..f8dbc58 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ requests==2.32.3 setuptools==45.2.0 static_ffmpeg==2.3 ffmpeg-python==0.2.0 -yt-dlp==2025.2.19 +yt-dlp>=2025.2.19 deepgram-sdk==2.4.0 boto3==1.26.143 black==23.3.0 diff --git a/routes/transcription.py b/routes/transcription.py index 2d867f6..bdb896c 100644 --- a/routes/transcription.py +++ b/routes/transcription.py @@ -112,6 +112,9 @@ async def add_to_queue( cutoff_date: Optional[str] = Form(None), source: Optional[str] = Form(None), source_file: Optional[UploadFile] = File(None), + correct: bool = Form(False), + summarize_llm: bool = Form(False), + llm_provider: str = Form("openai"), ): temp_file_path = None try: @@ -130,6 +133,9 @@ async def add_to_queue( include_metadata=not no_metadata, text_output=text, needs_review=needs_review, + correct=correct, + summarize_llm=summarize_llm, + llm_provider=llm_provider, ) if source_file: with tempfile.NamedTemporaryFile(delete=False) as tmp: diff --git a/transcriber.py b/transcriber.py index a0b23c1..e6e188b 100644 --- a/transcriber.py +++ b/transcriber.py @@ -210,6 +210,25 @@ def print_help(ctx, param, value): help="Supply this flag to enable verbose logging", ) +correct_transcript = click.option( + "--correct", + is_flag=True, + default=False, + help="Correct the transcript using the configured LLM provider.", +) +summarize_llm = click.option( + "--summarize-llm", + is_flag=True, + default=False, + help="Generate a new summary using the configured LLM provider.", +) +llm_provider = click.option( + "--llm-provider", + type=click.Choice(["openai", "google", "claude"]), + default=settings.config.get("llm_provider", "openai"), + help="LLM provider for correction and summarization.", +) + add_loc = click.option( "--loc", default="misc", @@ -279,6 +298,9 @@ def print_help(ctx, param, value): @nocleanup @verbose_logging @auto_start_server +@correct_transcript +@summarize_llm +@llm_provider def transcribe( source: str, loc: str, @@ -303,7 +325,9 @@ def transcribe( no_metadata: bool, needs_review: bool, cutoff_date: str, - nocheck: bool, + correct: bool, + summarize_llm: bool, + llm_provider: str, ) -> None: """Transcribe the provided sources. Suported sources include: \n - YouTube videos and playlists\n @@ -342,7 +366,9 @@ def transcribe( "include_metadata": not no_metadata, "needs_review": needs_review, "cutoff_date": cutoff_date, - "nocheck": nocheck, + "correct": correct, + "summarize_llm": summarize_llm, + "llm_provider": llm_provider, } try: queue_response = api_client.add_to_queue(data, source) From 25edacd9f3a570996c1b9a56b918cbee3f195d78 Mon Sep 17 00:00:00 2001 From: staru09 Date: Sun, 13 Jul 2025 00:15:25 +0530 Subject: [PATCH 2/7] flags tested --- app/transcription.py | 74 +++++++++++++++++++++++++++++++------------- 1 file changed, 52 insertions(+), 22 deletions(-) diff --git a/app/transcription.py b/app/transcription.py index 26288bc..1deaf7c 100644 --- a/app/transcription.py +++ b/app/transcription.py @@ -502,36 +502,66 @@ def postprocess(self, transcript: Transcript) -> None: with the existing code. """ try: - # --- NEW LLM REFINEMENT STEP --- - if self.llm_service: - if self.correct and transcript.outputs["raw"]: - self.logger.info(f"Correcting transcript with {self.llm_provider}...") - corrected_text = self.llm_service.correct_transcript( - transcript.outputs["raw"], self.llm_correction_model - ) - transcript.outputs["raw"] = corrected_text # Overwrite with corrected version + text_exporter = self.exporters.get("text") + + # --- Step 1: Save the original raw transcript if it exists --- + if transcript.outputs["raw"] and text_exporter: + raw_output_dir = text_exporter.get_output_path(transcript) + raw_file_path = text_exporter.construct_file_path( + directory=raw_output_dir, + filename=f"{transcript.title}_raw", + file_type="txt", + include_timestamp=False + ) + text_exporter.write_to_file(transcript.outputs["raw"], raw_file_path) + self.logger.info(f"Raw transcript saved to: {raw_file_path}") + + # --- Step 2: Perform LLM Correction --- + if self.llm_service and self.correct and transcript.outputs["raw"]: + self.logger.info(f"Correcting transcript with {self.llm_provider}...") + corrected_transcript_text = self.llm_service.correct_transcript( + transcript.outputs["raw"], self.llm_correction_model + ) - if self.summarize_llm and transcript.outputs["raw"]: - self.logger.info(f"Summarizing transcript with {self.llm_provider}...") - summary = self.llm_service.summarize_text( - transcript.outputs["raw"], self.llm_summary_model + if text_exporter: + corrected_output_dir = text_exporter.get_output_path(transcript) + corrected_file_path = text_exporter.construct_file_path( + directory=corrected_output_dir, + filename=f"{transcript.title}_corrected", + file_type="txt", + include_timestamp=False ) - transcript.summary = summary # Overwrite summary + text_exporter.write_to_file(corrected_transcript_text, corrected_file_path) + self.logger.info(f"Corrected transcript saved to: {corrected_file_path}") - # Handle markdown output + # Overwrite the main "raw" output so subsequent steps use the corrected version + transcript.outputs["raw"] = corrected_transcript_text + + # --- Step 3: Perform LLM Summarization --- + if self.llm_service and self.summarize_llm and transcript.outputs["raw"]: + self.logger.info(f"Summarizing transcript with {self.llm_provider}...") + summary_text = self.llm_service.summarize_text( + transcript.outputs["raw"], self.llm_summary_model + ) + transcript.summary = summary_text + + if text_exporter: + summary_output_dir = text_exporter.get_output_path(transcript) + summary_file_path = text_exporter.construct_file_path( + directory=summary_output_dir, + filename=f"{transcript.title}_summary", + file_type="txt", + include_timestamp=False + ) + text_exporter.write_to_file(summary_text, summary_file_path) + self.logger.info(f"Summary saved to: {summary_file_path}") + + # --- Step 4: Run existing exporters for final outputs (Markdown, JSON, etc.) --- if self.markdown or self.github_handler: transcript.outputs["markdown"] = self.write_to_markdown_file( transcript, ) - if "text" in self.exporters: - try: - transcript.outputs["text"] = self.exporters["text"].export( - transcript, add_timestamp=False - ) - except Exception as e: - self.logger.warning(f"Text exporter failed: {e}") - if "json" in self.exporters: transcript.outputs["json"] = self.exporters["json"].export( transcript From 84bf3f21273758bef6ee05ed6049dec6efdd121d Mon Sep 17 00:00:00 2001 From: staru09 Date: Mon, 14 Jul 2025 01:28:13 +0530 Subject: [PATCH 3/7] WIP: summarizer and correction --- app/exporters.py | 25 ++++-- app/github_api_handler.py | 35 ++++++--- app/services/correction.py | 42 ++++++++++ app/services/llm_service.py | 47 ------------ app/services/summarizer.py | 30 ++++++++ app/transcription.py | 148 ++++++++++++------------------------ config.ini.example | 8 +- routes/transcription.py | 2 - transcriber.py | 13 +--- 9 files changed, 170 insertions(+), 180 deletions(-) create mode 100644 app/services/correction.py delete mode 100644 app/services/llm_service.py create mode 100644 app/services/summarizer.py diff --git a/app/exporters.py b/app/exporters.py index e8ef544..ccfd15e 100644 --- a/app/exporters.py +++ b/app/exporters.py @@ -198,7 +198,7 @@ def _create_with_metadata(self, transcript: Transcript, **kwargs) -> str: Args: transcript: The transcript to export - **kwargs: Additional parameters like review_flag + **kwargs: Additional parameters like review_flag and content_key Returns: The complete Markdown content with metadata @@ -215,6 +215,13 @@ def increase_indent(self, flow=False, indentless=False): # Get metadata from the source metadata = transcript.source.to_json() + # Determine which content to use + content_key = kwargs.get("content_key", "corrected_text") + content = transcript.outputs.get(content_key, transcript.outputs.get("raw")) + + if content is None: + raise Exception(f"No transcript content found for key '{content_key}' or 'raw'") + # Add or modify specific fields if self.transcript_by: review_flag = kwargs.get("review_flag", "") @@ -312,6 +319,8 @@ def export(self, transcript: Transcript, **kwargs) -> str: Args: transcript: The transcript to export add_timestamp: Whether to add a timestamp to the filename (default: False) + content_key: The key in transcript.outputs to use for the content (default: "raw") + suffix: A suffix to add to the filename (e.g., "_raw") **kwargs: Additional parameters (unused) Returns: @@ -319,11 +328,17 @@ def export(self, transcript: Transcript, **kwargs) -> str: """ self.logger.debug("Exporting transcript to plain text...") - if transcript.outputs["raw"] is None: - raise Exception("No transcript content found") + content_key = kwargs.get("content_key", "raw") + content = transcript.outputs.get(content_key) + if content is None and content_key == "summary": + content = transcript.summary + + if content is None: + raise Exception(f"No content found for key: {content_key}") # Get parameters add_timestamp = kwargs.get("add_timestamp", False) + suffix = kwargs.get("suffix", "") # Get output directory output_dir = self.get_output_path(transcript) @@ -331,13 +346,13 @@ def export(self, transcript: Transcript, **kwargs) -> str: # Construct file path file_path = self.construct_file_path( directory=output_dir, - filename=transcript.title, + filename=f"{transcript.title}{suffix}", file_type="txt", include_timestamp=add_timestamp, ) # Write to file - result_path = self.write_to_file(transcript.outputs["raw"], file_path) + result_path = self.write_to_file(content, file_path) self.logger.info(f"(exporter) Text file written to: {result_path}") return result_path diff --git a/app/github_api_handler.py b/app/github_api_handler.py index e5afb5d..127fb53 100644 --- a/app/github_api_handler.py +++ b/app/github_api_handler.py @@ -93,13 +93,17 @@ def create_branch(self, repo_type, branch_name, sha): response = self._make_request('POST', url, json=data) return response.json() - def create_or_update_file(self, repo_type, file_path, content, commit_message, branch): + def create_or_update_file(self, repo_type, file_path, content, commit_message, branch, get_sha=False): url = f"https://api.github.com/repos/{self.repos[repo_type]['owner']}/{self.repos[repo_type]['name']}/contents/{quote(file_path)}" data = { "message": commit_message, "content": base64.b64encode(content.encode()).decode(), "branch": branch } + if get_sha: + response = self._make_request('GET', url + f'?ref={branch}') + data['sha'] = response.json()['sha'] + response = self._make_request('PUT', url, json=data) return response.json() @@ -114,23 +118,34 @@ def create_pull_request(self, repo_type, title, head, base, body): response = self._make_request('POST', url, json=data) return response.json() - def push_transcripts(self, transcripts: list[Transcript]) -> str | None: + def push_transcripts(self, transcripts: list[Transcript], markdown_exporter) -> str | None: try: default_branch = self.get_default_branch('transcripts') branch_sha = self.get_branch_sha('transcripts', default_branch) - branch_name = f"transcripts-{''.join(random.choices('0123456789', k=6))}" + branch_name = f"transcripts-{'' .join(random.choices('0123456789', k=6))}" self.create_branch('transcripts', branch_name, branch_sha) for transcript in transcripts: - if transcript.outputs and transcript.outputs['markdown']: - with open(transcript.outputs['markdown'], 'r') as file: - content = file.read() + # First commit: Raw transcript + raw_content = markdown_exporter._create_with_metadata(transcript, content_key='raw') + self.create_or_update_file( + 'transcripts', + transcript.output_path_with_title + ".md", + raw_content, + f'ai(transcript): "{transcript.title}" (raw)', + branch_name + ) + + # Second commit: Corrected transcript + if transcript.outputs.get('corrected_text'): + corrected_content = markdown_exporter._create_with_metadata(transcript, content_key='corrected_text') self.create_or_update_file( 'transcripts', - transcript.output_path_with_title, - content, - f'ai(transcript): "{transcript.title}" ({transcript.source.loc})', - branch_name + transcript.output_path_with_title + ".md", + corrected_content, + f'ai(transcript): "{transcript.title}" (corrected)', + branch_name, + get_sha=True # We need the SHA of the file to update it ) pr = self.create_pull_request( diff --git a/app/services/correction.py b/app/services/correction.py new file mode 100644 index 0000000..2da1440 --- /dev/null +++ b/app/services/correction.py @@ -0,0 +1,42 @@ +from app.transcript import Transcript +from app.logging import get_logger +import openai +from app.config import settings + +logger = get_logger() + +class CorrectionService: + def __init__(self, provider='openai', model='gpt-4o'): + self.provider = provider + self.model = model + if self.provider == 'openai': + self.client = openai + self.client.api_key = settings.OPENAI_API_KEY + else: + raise ValueError(f"Unsupported LLM provider: {provider}") + + def process(self, transcript: Transcript, **kwargs): + logger.info(f"Correcting transcript with {self.provider}...") + keywords = kwargs.get('keywords', []) + + # Build the prompt + prompt = self._build_prompt(transcript.outputs['raw'], keywords) + + # Call the LLM + response = self.client.chat.completions.create( + model=self.model, + messages=[{"role": "user", "content": prompt}] + ) + corrected_text = response.choices[0].message.content + + # Store the corrected text in a new field + transcript.outputs['corrected_text'] = corrected_text + logger.info("Correction complete.") + + def _build_prompt(self, text, keywords): + prompt = "Please correct the following transcript for punctuation, grammar, and spelling. Do not change the content or the speaker labels." + if keywords: + prompt += "\n\nPlease pay special attention to the following keywords and ensure they are spelled correctly:\n- " + prompt += "\n- ".join(keywords) + prompt += f"\n\n---\n\n{text}" + return prompt diff --git a/app/services/llm_service.py b/app/services/llm_service.py deleted file mode 100644 index 797c3a1..0000000 --- a/app/services/llm_service.py +++ /dev/null @@ -1,47 +0,0 @@ -from abc import ABC, abstractmethod -import openai -from app.config import settings - -class BaseLLM(ABC): - @abstractmethod - def correct_transcript(self, transcript_text: str, model: str) -> str: - pass - - @abstractmethod - def summarize_text(self, text: str, model: str) -> str: - pass - -class OpenAI_LLM(BaseLLM): - def __init__(self, api_key): - openai.api_key = api_key - - def correct_transcript(self, transcript_text: str, model: str) -> str: - prompt = f"""Please correct the following transcript for punctuation, grammar, and spelling. Do not change the content. - --- - {transcript_text}""" - - response = openai.chat.completions.create( - model=model, - messages=[{"role": "user", "content": prompt}] - ) - return response.choices[0].message.content - - def summarize_text(self, text: str, model: str) -> str: - prompt = f"""Please summarize the following text. - --- - {text}""" - - response = openai.chat.completions.create( - model=model, - messages=[{"role": "user", "content": prompt}] - ) - return response.choices[0].message.content - -class LLMFactory: - @staticmethod - def create_llm_service(provider: str) -> BaseLLM: - if provider == "openai": - return OpenAI_LLM(api_key=settings.OPENAI_API_KEY) - # Add other providers here - else: - raise ValueError(f"Unsupported LLM provider: {provider}") diff --git a/app/services/summarizer.py b/app/services/summarizer.py new file mode 100644 index 0000000..0f6eca2 --- /dev/null +++ b/app/services/summarizer.py @@ -0,0 +1,30 @@ +from app.transcript import Transcript +from app.logging import get_logger +import openai +from app.config import settings + +logger = get_logger() + +class SummarizerService: + def __init__(self, provider='openai', model='gpt-4o'): + self.provider = provider + self.model = model + if self.provider == 'openai': + self.client = openai + self.client.api_key = settings.OPENAI_API_KEY + else: + raise ValueError(f"Unsupported LLM provider: {provider}") + + def process(self, transcript: Transcript, **kwargs): + logger.info(f"Summarizing transcript with {self.provider}...") + text_to_summarize = transcript.outputs.get('corrected_text', transcript.outputs['raw']) + + prompt = f"""Please summarize the following text.\n---\n{text_to_summarize}""" + + response = self.client.chat.completions.create( + model=self.model, + messages=[{"role": "user", "content": prompt}] + ) + summary = response.choices[0].message.content + transcript.summary = summary + logger.info("Summarization complete.") diff --git a/app/transcription.py b/app/transcription.py index 1deaf7c..69f1245 100644 --- a/app/transcription.py +++ b/app/transcription.py @@ -14,7 +14,8 @@ from app.data_fetcher import DataFetcher from app.github_api_handler import GitHubAPIHandler from app.exporters import ExporterFactory, TranscriptExporter -from app.services.llm_service import LLMFactory +from app.services.correction import CorrectionService +from app.services.summarizer import SummarizerService class Transcription: @@ -38,13 +39,12 @@ def __init__( needs_review=False, include_metadata=True, correct=False, - summarize_llm=False, llm_provider="openai", - llm_correction_model="gpt-4o-mini", + llm_correction_model="gpt-4o", llm_summary_model="gpt-4o", ): self.nocleanup = nocleanup - self.status = "idle" # Can be "idle", "in_progress", or "completed" + self.status = "idle" self.test_mode = test_mode self.logger = get_logger() self.tmp_dir = ( @@ -52,7 +52,6 @@ def __init__( ) self.transcript_by = self.__configure_username(username) - # during testing we need to create the markdown for validation purposes self.markdown = markdown or test_mode self.include_metadata = include_metadata @@ -60,17 +59,16 @@ def __init__( self.__configure_tstbtc_metadata_dir() ) - # Create exporters for transcript output formats - export_config = { - "markdown": self.markdown, - "text_output": text_output, - "json": json, - "model_output_dir": model_output_dir, - } self.exporters: dict[ str, TranscriptExporter ] = ExporterFactory.create_exporters( - config=export_config, transcript_by=self.transcript_by + config={ + "markdown": self.markdown, + "text_output": text_output, + "json": json, + "model_output_dir": model_output_dir, + }, + transcript_by=self.transcript_by, ) self.model_output_dir = model_output_dir @@ -80,8 +78,12 @@ def __init__( self.github_handler = GitHubAPIHandler() self.review_flag = self.__configure_review_flag(needs_review) - # @TODO: use ExporterFactory instead of `metadata_writer` for - # services metadata output + self.processing_services = [] + if correct: + self.processing_services.append(CorrectionService(provider=llm_provider, model=llm_correction_model)) + if summarize: + self.processing_services.append(SummarizerService(provider=llm_provider, model=llm_summary_model)) + if deepgram: self.service = services.Deepgram( summarize, diarize, upload, self.metadata_writer @@ -92,18 +94,7 @@ def __init__( self.transcripts: list[Transcript] = [] self.existing_media = None self.preprocessing_output = [] if batch_preprocessing_output else None - self.data_fetcher = DataFetcher(settings.BTC_TRANSCRIPTS_URL) - - self.correct = correct - self.summarize_llm = summarize_llm - self.llm_provider = llm_provider - self.llm_correction_model = llm_correction_model - self.llm_summary_model = llm_summary_model - - if self.correct or self.summarize_llm: - self.llm_service = LLMFactory.create_llm_service(self.llm_provider) - else: - self.llm_service = None + self.data_fetcher = DataFetcher(base_url="http://btctranscripts.com") self.logger.debug(f"Temp directory: {self.tmp_dir}") @@ -435,6 +426,7 @@ def start(self, test_transcript=None): self.service.transcribe(transcript) transcript.status = "completed" self.postprocess(transcript) + self.export(transcript) self.status = "completed" if self.github: @@ -448,7 +440,12 @@ def push_to_github(self, transcripts: list[Transcript]): if not self.github_handler: return - pr_url_transcripts = self.github_handler.push_transcripts(transcripts) + markdown_exporter = self.exporters.get("markdown") + if not markdown_exporter: + self.logger.error("Markdown exporter not configured, cannot push to GitHub.") + return + + pr_url_transcripts = self.github_handler.push_transcripts(transcripts, markdown_exporter) if pr_url_transcripts: self.logger.info( f"transcripts: Pull request created: {pr_url_transcripts}" @@ -496,79 +493,30 @@ def write_to_markdown_file(self, transcript: Transcript): raise Exception(f"Error writing to markdown file: {e}") def postprocess(self, transcript: Transcript) -> None: - """ - Process the transcript to produce output files in the configured formats. - This updated method uses exporters when available, but maintains compatibility - with the existing code. - """ - try: - text_exporter = self.exporters.get("text") - - # --- Step 1: Save the original raw transcript if it exists --- - if transcript.outputs["raw"] and text_exporter: - raw_output_dir = text_exporter.get_output_path(transcript) - raw_file_path = text_exporter.construct_file_path( - directory=raw_output_dir, - filename=f"{transcript.title}_raw", - file_type="txt", - include_timestamp=False - ) - text_exporter.write_to_file(transcript.outputs["raw"], raw_file_path) - self.logger.info(f"Raw transcript saved to: {raw_file_path}") - - # --- Step 2: Perform LLM Correction --- - if self.llm_service and self.correct and transcript.outputs["raw"]: - self.logger.info(f"Correcting transcript with {self.llm_provider}...") - corrected_transcript_text = self.llm_service.correct_transcript( - transcript.outputs["raw"], self.llm_correction_model - ) - - if text_exporter: - corrected_output_dir = text_exporter.get_output_path(transcript) - corrected_file_path = text_exporter.construct_file_path( - directory=corrected_output_dir, - filename=f"{transcript.title}_corrected", - file_type="txt", - include_timestamp=False - ) - text_exporter.write_to_file(corrected_transcript_text, corrected_file_path) - self.logger.info(f"Corrected transcript saved to: {corrected_file_path}") - - # Overwrite the main "raw" output so subsequent steps use the corrected version - transcript.outputs["raw"] = corrected_transcript_text - - # --- Step 3: Perform LLM Summarization --- - if self.llm_service and self.summarize_llm and transcript.outputs["raw"]: - self.logger.info(f"Summarizing transcript with {self.llm_provider}...") - summary_text = self.llm_service.summarize_text( - transcript.outputs["raw"], self.llm_summary_model - ) - transcript.summary = summary_text - - if text_exporter: - summary_output_dir = text_exporter.get_output_path(transcript) - summary_file_path = text_exporter.construct_file_path( - directory=summary_output_dir, - filename=f"{transcript.title}_summary", - file_type="txt", - include_timestamp=False - ) - text_exporter.write_to_file(summary_text, summary_file_path) - self.logger.info(f"Summary saved to: {summary_file_path}") - - # --- Step 4: Run existing exporters for final outputs (Markdown, JSON, etc.) --- - if self.markdown or self.github_handler: - transcript.outputs["markdown"] = self.write_to_markdown_file( - transcript, - ) - - if "json" in self.exporters: - transcript.outputs["json"] = self.exporters["json"].export( - transcript - ) + for service in self.processing_services: + service.process(transcript) + + def export(self, transcript: Transcript): + """Exports the transcript to the configured formats.""" + text_exporter = self.exporters.get("text") + if text_exporter: + # Save raw, corrected, and summary files + if transcript.outputs.get("raw"): + text_exporter.export(transcript, add_timestamp=False, content_key="raw", suffix="_raw") + if transcript.outputs.get("corrected_text"): + text_exporter.export(transcript, add_timestamp=False, content_key="corrected_text", suffix="_corrected") + if transcript.summary: + text_exporter.export(transcript, add_timestamp=False, content_key="summary", suffix="_summary") + + if self.markdown or self.github_handler: + transcript.outputs["markdown"] = self.write_to_markdown_file( + transcript, + ) - except Exception as e: - raise Exception(f"Error with postprocessing: {e}") from e + if "json" in self.exporters: + transcript.outputs["json"] = self.exporters["json"].export( + transcript + ) def clean_up(self): self.logger.debug("Cleaning up...") diff --git a/config.ini.example b/config.ini.example index d215d70..bc3eafd 100644 --- a/config.ini.example +++ b/config.ini.example @@ -6,13 +6,11 @@ github = False save_to_markdown = True needs_review = False one_sentence_per_line = True +llm_provider = openai +llm_correction_model = gpt-4o +llm_summary_model = gpt-4o [development] verbose_logging = True server_mode = dev server_verbose = True - -[llm] -llm_provider = openai -llm_correction_model = gpt-4o -llm_summary_model = gpt-3.5 \ No newline at end of file diff --git a/routes/transcription.py b/routes/transcription.py index bdb896c..30d554a 100644 --- a/routes/transcription.py +++ b/routes/transcription.py @@ -113,7 +113,6 @@ async def add_to_queue( source: Optional[str] = Form(None), source_file: Optional[UploadFile] = File(None), correct: bool = Form(False), - summarize_llm: bool = Form(False), llm_provider: str = Form("openai"), ): temp_file_path = None @@ -134,7 +133,6 @@ async def add_to_queue( text_output=text, needs_review=needs_review, correct=correct, - summarize_llm=summarize_llm, llm_provider=llm_provider, ) if source_file: diff --git a/transcriber.py b/transcriber.py index e6e188b..c6335d0 100644 --- a/transcriber.py +++ b/transcriber.py @@ -117,7 +117,7 @@ def print_help(ctx, param, value): is_flag=True, default=settings.config.getboolean("summarize", False), show_default=True, - help="Summarize the transcript [only available with deepgram]", + help="Summarize the transcript using the configured LLM provider.", ) cutoff_date = click.option( "--cutoff-date", @@ -213,15 +213,9 @@ def print_help(ctx, param, value): correct_transcript = click.option( "--correct", is_flag=True, - default=False, + default=settings.config.getboolean("correct", False), help="Correct the transcript using the configured LLM provider.", ) -summarize_llm = click.option( - "--summarize-llm", - is_flag=True, - default=False, - help="Generate a new summary using the configured LLM provider.", -) llm_provider = click.option( "--llm-provider", type=click.Choice(["openai", "google", "claude"]), @@ -299,7 +293,6 @@ def print_help(ctx, param, value): @verbose_logging @auto_start_server @correct_transcript -@summarize_llm @llm_provider def transcribe( source: str, @@ -326,7 +319,6 @@ def transcribe( needs_review: bool, cutoff_date: str, correct: bool, - summarize_llm: bool, llm_provider: str, ) -> None: """Transcribe the provided sources. Suported sources include: \n @@ -367,7 +359,6 @@ def transcribe( "needs_review": needs_review, "cutoff_date": cutoff_date, "correct": correct, - "summarize_llm": summarize_llm, "llm_provider": llm_provider, } try: From 538e0ee0a26a6267def55f4f5fa5a786bcaf2928 Mon Sep 17 00:00:00 2001 From: staru09 Date: Mon, 4 Aug 2025 22:54:59 +0530 Subject: [PATCH 4/7] correction prompt updated --- app/services/correction.py | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/app/services/correction.py b/app/services/correction.py index 2da1440..6e5c166 100644 --- a/app/services/correction.py +++ b/app/services/correction.py @@ -19,8 +19,9 @@ def process(self, transcript: Transcript, **kwargs): logger.info(f"Correcting transcript with {self.provider}...") keywords = kwargs.get('keywords', []) - # Build the prompt - prompt = self._build_prompt(transcript.outputs['raw'], keywords) + metadata = transcript.source.to_json() + + prompt = self._build_prompt(transcript.outputs['raw'], keywords, metadata) # Call the LLM response = self.client.chat.completions.create( @@ -33,10 +34,29 @@ def process(self, transcript: Transcript, **kwargs): transcript.outputs['corrected_text'] = corrected_text logger.info("Correction complete.") - def _build_prompt(self, text, keywords): - prompt = "Please correct the following transcript for punctuation, grammar, and spelling. Do not change the content or the speaker labels." + def _build_prompt(self, text, keywords, metadata): + prompt = ( + "You are a domain expert in Bitcoin and blockchain technologies.\n\n" + "The following transcript was generated using an automatic speech recognition (ASR) system. " + "Your task is to correct it based on the contextual metadata provided.\n\n" + "--- Contextual Metadata ---\n" + ) + + if metadata.get('title'): + prompt += f"Title: {metadata['title']}\n" + if metadata.get('speakers'): + prompt += f"Speakers: {', '.join(metadata['speakers'])}\n" + if metadata.get('tags'): + prompt += f"Tags: {', '.join(metadata['tags'])}\n" + + prompt += "Please use this metadata to improve the accuracy of your corrections.\n" + if keywords: - prompt += "\n\nPlease pay special attention to the following keywords and ensure they are spelled correctly:\n- " + prompt += ( + "\nAdditionally, prioritize the following keywords. Ensure they are spelled, cased, and formatted correctly " + "whenever they appear in the transcript:\n- " + ) prompt += "\n- ".join(keywords) - prompt += f"\n\n---\n\n{text}" + + prompt += f"\n\n--- Transcript Start ---\n\n{text.strip()}\n\n--- Transcript End ---" return prompt From 3ee213547843165144e2b2997b42e87bcb2afc85 Mon Sep 17 00:00:00 2001 From: staru09 Date: Mon, 4 Aug 2025 22:59:57 +0530 Subject: [PATCH 5/7] openai added to requirements --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f8dbc58..949e0fc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,4 +20,5 @@ librosa==0.10.2.post1 fastapi==0.111.0 PyJWT==2.9.0 cryptography==43.0.1 -psutil==7.0.0 \ No newline at end of file +psutil==7.0.0 +openai==1.98.0 From 5dfa37397e710daacb68899d39a39a7551766fff Mon Sep 17 00:00:00 2001 From: staru09 Date: Wed, 6 Aug 2025 00:02:41 +0530 Subject: [PATCH 6/7] tests updated --- requirements.txt | 1 + test/exporters/test_text.py | 2 +- .../test_transcription_exporters.py | 98 +++++++++++++++---- 3 files changed, 80 insertions(+), 21 deletions(-) diff --git a/requirements.txt b/requirements.txt index 949e0fc..7822410 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,3 +22,4 @@ PyJWT==2.9.0 cryptography==43.0.1 psutil==7.0.0 openai==1.98.0 +pytest-asyncio==0.21.0 \ No newline at end of file diff --git a/test/exporters/test_text.py b/test/exporters/test_text.py index ce22a67..94cdfba 100644 --- a/test/exporters/test_text.py +++ b/test/exporters/test_text.py @@ -60,7 +60,7 @@ def test_error_handling_no_content(self, text_exporter, mock_transcript): text_exporter.export(mock_transcript) # Check the error message - assert "No transcript content found" in str(excinfo.value) + assert "No content found for key: raw" in str(excinfo.value) def test_output_directory_creation(self, temp_dir, mock_transcript): """Test that the exporter creates output directories as needed""" diff --git a/test/integration/test_transcription_exporters.py b/test/integration/test_transcription_exporters.py index 9c2afeb..fb068d3 100644 --- a/test/integration/test_transcription_exporters.py +++ b/test/integration/test_transcription_exporters.py @@ -82,18 +82,18 @@ def test_write_to_markdown_file( # Check the result assert result == "/path/to/exported/markdown.md" - def test_postprocess_with_markdown( + def test_export_with_markdown( self, transcription_with_exporters, mock_transcript ): - """Test postprocess with markdown output""" + """Test export with markdown output""" # Mock the write_to_markdown_file method to avoid calling the exporter directly transcription_with_exporters.write_to_markdown_file = mock.MagicMock() transcription_with_exporters.write_to_markdown_file.return_value = ( "/path/to/exported/markdown.md" ) - # Call postprocess - transcription_with_exporters.postprocess(mock_transcript) + # Call export + transcription_with_exporters.export(mock_transcript) # Check that write_to_markdown_file was called transcription_with_exporters.write_to_markdown_file.assert_called_once() @@ -104,13 +104,13 @@ def test_postprocess_with_markdown( == "/path/to/exported/markdown.md" ) - def test_postprocess_with_text( + def test_export_with_text( self, transcription_with_exporters, mock_transcript, mock_transcription_deps, ): - """Test postprocess with text output""" + """Test export with text output""" # Get the mock exporters exporters = mock_transcription_deps[ "ExporterFactory" @@ -126,23 +126,17 @@ def test_postprocess_with_text( "/path/to/exported/markdown.md" ) - # Call postprocess - transcription_with_exporters.postprocess(mock_transcript) + # Call export + transcription_with_exporters.export(mock_transcript) # Check that the text exporter was called - text_exporter.export.assert_called_once() + text_exporter.export.assert_called() assert text_exporter.export.call_args[1]["add_timestamp"] is False - # Check that the output was stored in the transcript - assert ( - mock_transcript.outputs["text"] - == "/path/to/exported/transcript.txt" - ) - - def test_postprocess_with_json( + def test_export_with_json( self, transcription_with_exporters, mock_transcript, mock_transcription_deps ): - """Test postprocess with JSON output""" + """Test export with JSON output""" # Get the mock exporters exporters = mock_transcription_deps[ "ExporterFactory" @@ -158,8 +152,8 @@ def test_postprocess_with_json( "/path/to/exported/markdown.md" ) - # Call postprocess - transcription_with_exporters.postprocess(mock_transcript) + # Call export + transcription_with_exporters.export(mock_transcript) # Check that the json exporter was called json_exporter.export.assert_called_once() @@ -168,4 +162,68 @@ def test_postprocess_with_json( assert ( mock_transcript.outputs["json"] == "/path/to/exported/transcript.json" - ) \ No newline at end of file + ) + + def test_export_with_all_outputs( + self, + transcription_with_exporters, + mock_transcript, + mock_transcription_deps, + ): + """Test export with all outputs enabled""" + # Get mock exporters + exporters = mock_transcription_deps[ + "ExporterFactory" + ].create_exporters.return_value + text_exporter = exporters["text"] + json_exporter = exporters["json"] + + # Set up return values + text_exporter.export.return_value = "/path/to/text.txt" + json_exporter.export.return_value = "/path/to/json.json" + + # Mock write_to_markdown_file + transcription_with_exporters.write_to_markdown_file = mock.MagicMock() + transcription_with_exporters.write_to_markdown_file.return_value = ( + "/path/to/markdown.md" + ) + + # Call export + transcription_with_exporters.export(mock_transcript) + + # Check that all exporters were called + text_exporter.export.assert_called() + json_exporter.export.assert_called_once() + transcription_with_exporters.write_to_markdown_file.assert_called_once() + + # Check that all outputs are stored + assert mock_transcript.outputs["json"] == "/path/to/json.json" + assert mock_transcript.outputs["markdown"] == "/path/to/markdown.md" + + def test_export_no_outputs( + self, patched_transcription, mock_transcript, mock_transcription_deps + ): + """Test export with no outputs enabled""" + # Create a Transcription instance with all export options disabled + transcription = Transcription( + markdown=False, text_output=False, json=False, username="Test User" + ) + transcription.exporters.clear() + + # Clear the mock transcript's outputs and add back the raw output + mock_transcript.outputs.clear() + mock_transcript.outputs['raw'] = 'test transcript' + + # Mock write_to_markdown_file + transcription.write_to_markdown_file = mock.MagicMock() + + # Call export + transcription.export(mock_transcript) + + # Check that no exporters were called + transcription.write_to_markdown_file.assert_not_called() + + # Check that no outputs were stored + assert "text" not in mock_transcript.outputs + assert "json" not in mock_transcript.outputs + assert "markdown" not in mock_transcript.outputs From 7bb34d1cf9111e025e9dd07467fdf7ff1576baf2 Mon Sep 17 00:00:00 2001 From: staru09 Date: Sun, 17 Aug 2025 20:22:43 +0530 Subject: [PATCH 7/7] corrections fixed --- app/services/correction.py | 95 +++++++-- app/services/global_tag_manager.py | 314 +++++++++++++++++++++++++++++ app/transcription.py | 23 ++- 3 files changed, 412 insertions(+), 20 deletions(-) create mode 100644 app/services/global_tag_manager.py diff --git a/app/services/correction.py b/app/services/correction.py index 6e5c166..7501f91 100644 --- a/app/services/correction.py +++ b/app/services/correction.py @@ -1,5 +1,6 @@ from app.transcript import Transcript from app.logging import get_logger +from app.services.global_tag_manager import GlobalTagManager import openai from app.config import settings @@ -9,6 +10,7 @@ class CorrectionService: def __init__(self, provider='openai', model='gpt-4o'): self.provider = provider self.model = model + self.tag_manager = GlobalTagManager() if self.provider == 'openai': self.client = openai self.client.api_key = settings.OPENAI_API_KEY @@ -20,43 +22,108 @@ def process(self, transcript: Transcript, **kwargs): keywords = kwargs.get('keywords', []) metadata = transcript.source.to_json() + global_context = self.tag_manager.get_correction_context() - prompt = self._build_prompt(transcript.outputs['raw'], keywords, metadata) + prompt = self._build_enhanced_prompt(transcript.outputs['raw'], keywords, metadata, global_context) - # Call the LLM response = self.client.chat.completions.create( model=self.model, messages=[{"role": "user", "content": prompt}] ) corrected_text = response.choices[0].message.content - # Store the corrected text in a new field transcript.outputs['corrected_text'] = corrected_text logger.info("Correction complete.") - def _build_prompt(self, text, keywords, metadata): + def _build_enhanced_prompt(self, text, keywords, metadata, global_context): prompt = ( - "You are a domain expert in Bitcoin and blockchain technologies.\n\n" - "The following transcript was generated using an automatic speech recognition (ASR) system. " - "Your task is to correct it based on the contextual metadata provided.\n\n" - "--- Contextual Metadata ---\n" + "You are a transcript correction specialist with expertise in Bitcoin and blockchain terminology.\n\n" + "The following transcript was generated by automatic speech recognition (ASR). Your task is to " + "correct ONLY the obvious mistakes while keeping the transcript as close to the original as possible.\n\n" + "DO NOT:\n" + "- Rephrase or rewrite sentences\n" + "- Change the speaker's style or tone\n" + "- Add or remove content\n" + "- Make major structural changes\n\n" + "DO:\n" + "- Fix spelling errors and typos\n" + "- Correct misheard words using context\n" + "- Fix technical terminology and proper names\n" + "- Maintain the exact same flow and structure\n\n" + "--- Current Video Metadata ---\n" ) if metadata.get('title'): - prompt += f"Title: {metadata['title']}\n" + prompt += f"Video Title: {metadata['title']}\n" if metadata.get('speakers'): prompt += f"Speakers: {', '.join(metadata['speakers'])}\n" if metadata.get('tags'): - prompt += f"Tags: {', '.join(metadata['tags'])}\n" + prompt += f"Video Tags: {', '.join(metadata['tags'])}\n" + if metadata.get('categories'): + prompt += f"Categories: {', '.join(metadata['categories'])}\n" + if metadata.get('youtube', {}).get('description'): + description = metadata['youtube']['description'][:200] + "..." if len(metadata['youtube']['description']) > 200 else metadata['youtube']['description'] + prompt += f"Description: {description}\n" + + video_count = global_context.get('video_count', 0) + prompt += f"\n--- Global Bitcoin Knowledge Base (From {video_count} Transcripts) ---\n" + + if global_context.get('frequent_tags'): + frequent_tags = global_context['frequent_tags'][:15] + prompt += f"Most Common Topics: {', '.join(frequent_tags)}\n" + + if global_context.get('technical_terms'): + tech_terms = global_context['technical_terms'][:20] + prompt += f"Technical Terms to Recognize: {', '.join(tech_terms)}\n" + + if global_context.get('project_names'): + projects = global_context['project_names'][:15] + prompt += f"Bitcoin Projects/Tools: {', '.join(projects)}\n" + + if global_context.get('common_speakers'): + speakers = global_context['common_speakers'][:10] + prompt += f"Frequent Speakers: {', '.join(speakers)}\n" + + if global_context.get('common_categories'): + categories = global_context['common_categories'][:8] + prompt += f"Common Content Categories: {', '.join(categories)}\n" - prompt += "Please use this metadata to improve the accuracy of your corrections.\n" + if global_context.get('expertise_areas'): + areas = global_context['expertise_areas'][:8] + prompt += f"Domain Expertise Areas: {', '.join(areas)}\n" + + if global_context.get('domain_context'): + prompt += f"Primary Domain Focus: {global_context['domain_context']}\n" + + prompt += "\n--- Focus Areas for Correction ---\n" + prompt += "Using the metadata and global knowledge, focus on correcting:\n" + prompt += "1. Technical terms (ensure proper spelling and capitalization)\n" + prompt += "2. Speaker names and project names (match known variations)\n" + prompt += "3. Common ASR mishears (but, bit, big -> Bitcoin when context suggests it)\n" + prompt += "4. Homophones and similar-sounding words in Bitcoin context\n" + prompt += "5. Obvious typos and spelling mistakes\n\n" + prompt += "IMPORTANT: Make minimal changes - only fix clear errors, don't improve the text.\n" + + if global_context.get('tag_variations'): + variations = global_context['tag_variations'] + if variations: + prompt += "\n--- Common Term Variations ---\n" + for base_term, variants in list(variations.items())[:5]: + prompt += f"{base_term}: {', '.join(variants)}\n" if keywords: prompt += ( - "\nAdditionally, prioritize the following keywords. Ensure they are spelled, cased, and formatted correctly " - "whenever they appear in the transcript:\n- " + "\n--- Additional Priority Keywords ---\n" + "Pay special attention to these terms and ensure correct spelling/formatting:\n- " ) prompt += "\n- ".join(keywords) - prompt += f"\n\n--- Transcript Start ---\n\n{text.strip()}\n\n--- Transcript End ---" + prompt += f"\n\n--- Transcript Start ---\n\n{text.strip()}\n\n--- Transcript End ---\n\n" + prompt += "Return ONLY the corrected transcript. Make minimal changes - fix only obvious errors while " + prompt += "preserving the original wording, sentence structure, and speaker's natural expression." + return prompt + + def _build_prompt(self, text, keywords, metadata): + """Legacy method for backward compatibility""" + return self._build_enhanced_prompt(text, keywords, metadata, {}) diff --git a/app/services/global_tag_manager.py b/app/services/global_tag_manager.py new file mode 100644 index 0000000..3047ea9 --- /dev/null +++ b/app/services/global_tag_manager.py @@ -0,0 +1,314 @@ +import json +import os +from datetime import datetime, timezone +from collections import defaultdict +from typing import Dict, List, Any +from app.config import settings +from app.logging import get_logger +import re + +logger = get_logger() + +class GlobalTagManager: + """ + Manages a global dictionary of tags and terminology from all processed videos + to enhance correction accuracy across the entire corpus. + """ + + def __init__(self, metadata_dir=None): + self.metadata_dir = metadata_dir or settings.TSTBTC_METADATA_DIR or "metadata/" + self.dict_file = os.path.join(self.metadata_dir, "global_tag_dictionary.json") + self.tag_dict = self._load_dictionary() + + def _load_dictionary(self) -> Dict[str, Any]: + """Load existing global tag dictionary or create new one""" + if os.path.exists(self.dict_file): + try: + with open(self.dict_file, 'r', encoding='utf-8') as f: + return json.load(f) + except (json.JSONDecodeError, IOError) as e: + logger.warning(f"Failed to load global tag dictionary: {e}. Creating new one.") + + return self._create_new_dictionary() + + def _create_new_dictionary(self) -> Dict[str, Any]: + """Create a new global tag dictionary structure""" + return { + "last_updated": datetime.now(timezone.utc).isoformat(), + "version": "1.0", + "tags": {}, + "technical_terms": [], + "speaker_context": { + "common_speakers": [], + "expertise_areas": [] + }, + "project_names": [], + "categories": {}, + "video_count": 0, + "common_words": {} + } + + def _save_dictionary(self): + """Save the global tag dictionary to file""" + try: + os.makedirs(os.path.dirname(self.dict_file), exist_ok=True) + self.tag_dict["last_updated"] = datetime.now(timezone.utc).isoformat() + + with open(self.dict_file, 'w', encoding='utf-8') as f: + json.dump(self.tag_dict, f, indent=4, ensure_ascii=False) + + logger.debug(f"Global tag dictionary saved to {self.dict_file}") + except IOError as e: + logger.error(f"Failed to save global tag dictionary: {e}") + + def update_from_transcript(self, transcript): + """Update global dictionary with new transcript's metadata""" + try: + metadata = transcript.source.to_json() + + manual_tags = metadata.get('tags', []) + youtube_metadata = metadata.get('youtube', {}) + youtube_tags = youtube_metadata.get('tags', []) if youtube_metadata else [] + categories = metadata.get('categories', []) + speakers = metadata.get('speakers', []) + title = metadata.get('title', '') + description = youtube_metadata.get('description', '') if youtube_metadata else '' + + self.tag_dict["video_count"] = self.tag_dict.get("video_count", 0) + 1 + + all_tags = manual_tags + youtube_tags + categories + + for tag in all_tags: + if tag and isinstance(tag, str): + self._update_tag_entry(tag.strip()) + + for category in categories: + if category and isinstance(category, str): + self._update_category_frequency(category.strip()) + + text_content = f"{title} {description}".lower() + self._extract_technical_terms_dynamically(text_content, all_tags) + + for speaker in speakers: + if speaker and isinstance(speaker, str): + self._update_speaker_context(speaker.strip()) + + self._identify_project_names_dynamically(text_content, all_tags) + + self._update_expertise_areas(categories + all_tags) + + self._save_dictionary() + logger.info(f"Updated global tag dictionary with transcript: {title}") + + except Exception as e: + logger.error(f"Failed to update global tag dictionary: {e}") + + def _update_tag_entry(self, tag: str): + """Update or create entry for a tag in the dictionary""" + tag_lower = tag.lower() + tags_dict = self.tag_dict["tags"] + + if tag_lower in tags_dict: + tags_dict[tag_lower]["frequency"] += 1 + tags_dict[tag_lower]["last_seen"] = datetime.now(timezone.utc).isoformat() + + if tag not in tags_dict[tag_lower]["variations"]: + tags_dict[tag_lower]["variations"].append(tag) + else: + tags_dict[tag_lower] = { + "frequency": 1, + "variations": [tag], + "context": self._infer_context(tag_lower), + "first_seen": datetime.now(timezone.utc).isoformat(), + "last_seen": datetime.now(timezone.utc).isoformat() + } + + def _update_speaker_context(self, speaker: str): + """Update speaker information in the global context""" + speaker_context = self.tag_dict["speaker_context"] + if speaker not in speaker_context["common_speakers"]: + speaker_context["common_speakers"].append(speaker) + if len(speaker_context["common_speakers"]) > 50: + speaker_context["common_speakers"] = speaker_context["common_speakers"][-50:] + + def _update_category_frequency(self, category: str): + """Track category frequencies""" + categories_dict = self.tag_dict.get("categories", {}) + category_lower = category.lower() + categories_dict[category_lower] = categories_dict.get(category_lower, 0) + 1 + self.tag_dict["categories"] = categories_dict + + def _extract_technical_terms_dynamically(self, text_content: str, tags: List[str]): + """Dynamically extract technical terms from content and tags""" + technical_terms = self.tag_dict.get("technical_terms", []) + + bitcoin_indicators = ['bitcoin', 'btc', 'blockchain', 'crypto', 'lightning', 'ln'] + is_bitcoin_content = any(indicator in text_content or + any(indicator in tag.lower() for tag in tags) + for indicator in bitcoin_indicators) + + if is_bitcoin_content: + for tag in tags: + if tag and len(tag) > 3 and not tag.isdigit(): + tag_lower = tag.lower() + technical_indicators = ['network', 'protocol', 'script', 'sig', 'key', 'hash', + 'node', 'chain', 'block', 'tx', 'vault', 'channel'] + if (any(indicator in tag_lower for indicator in technical_indicators) or + tag_lower.startswith(('op_', 'bip', 'bolt')) or + tag_lower in ['taproot', 'segwit', 'multisig', 'htlc']): + if tag_lower not in technical_terms: + technical_terms.append(tag_lower) + + self.tag_dict["technical_terms"] = technical_terms + + def _identify_project_names_dynamically(self, text_content: str, tags: List[str]): + """Dynamically identify project names from content and tags""" + project_names = self.tag_dict.get("project_names", []) + + for tag in tags: + if tag and len(tag) > 2: + if (tag[0].isupper() or + 'core' in tag.lower() or 'lightning' in tag.lower() or + any(pattern in tag.lower() for pattern in ['btc', 'lightning', 'wallet', 'pay'])): + if tag not in project_names: + project_names.append(tag) + + self.tag_dict["project_names"] = project_names + + def _update_expertise_areas(self, tags_and_categories: List[str]): + """Update expertise areas based on categories and tags""" + expertise_areas = self.tag_dict.get("speaker_context", {}).get("expertise_areas", []) + + expertise_mapping = { + 'development': ['development', 'dev', 'programming', 'coding', 'technical'], + 'podcast': ['podcast', 'interview', 'discussion'], + 'conference': ['conference', 'talk', 'presentation', 'summit'], + 'education': ['education', 'tutorial', 'learning', 'teaching'], + 'mining': ['mining', 'miner', 'hashrate', 'pool'], + 'security': ['security', 'privacy', 'cryptography', 'audit'], + 'payments': ['payments', 'lightning', 'channel', 'transaction'], + 'trading': ['trading', 'exchange', 'market', 'price'] + } + + for item in tags_and_categories: + if item and isinstance(item, str): + item_lower = item.lower() + for area, keywords in expertise_mapping.items(): + if any(keyword in item_lower for keyword in keywords): + if area not in expertise_areas: + expertise_areas.append(area) + + speaker_context = self.tag_dict.get("speaker_context", {}) + speaker_context["expertise_areas"] = expertise_areas + self.tag_dict["speaker_context"] = speaker_context + + def _infer_context(self, tag: str) -> str: + """Infer context category for a tag""" + development_terms = ["script", "bdk", "core", "node", "api", "rpc", "development"] + payment_terms = ["lightning", "payment", "channel", "invoice", "bolt"] + security_terms = ["multisig", "signature", "key", "seed", "private", "security"] + mining_terms = ["mining", "hash", "difficulty", "block", "pow"] + + if any(term in tag for term in development_terms): + return "development" + elif any(term in tag for term in payment_terms): + return "payments" + elif any(term in tag for term in security_terms): + return "security" + elif any(term in tag for term in mining_terms): + return "mining" + else: + return "general" + + def get_correction_context(self) -> Dict[str, Any]: + """Get enriched context for correction prompts""" + tags_dict = self.tag_dict.get("tags", {}) + + frequent_tags = sorted( + tags_dict.items(), + key=lambda x: x[1]["frequency"], + reverse=True + )[:30] + + categories_dict = self.tag_dict.get("categories", {}) + common_categories = sorted( + categories_dict.items(), + key=lambda x: x[1], + reverse=True + )[:10] + + return { + 'frequent_tags': [tag for tag, _ in frequent_tags], + 'tag_variations': self._get_tag_variations(), + 'technical_terms': self.tag_dict.get('technical_terms', []), + 'project_names': self.tag_dict.get('project_names', []), + 'common_speakers': self.tag_dict.get('speaker_context', {}).get('common_speakers', [])[:20], + 'common_categories': [cat for cat, _ in common_categories], + 'expertise_areas': self.tag_dict.get('speaker_context', {}).get('expertise_areas', []), + 'domain_context': self._build_domain_context(), + 'video_count': self.tag_dict.get('video_count', 0) + } + + def _get_tag_variations(self) -> Dict[str, List[str]]: + """Get mapping of tags to their variations""" + variations = {} + for tag, data in self.tag_dict.get("tags", {}).items(): + if len(data["variations"]) > 1: + variations[tag] = data["variations"] + return variations + + def _build_domain_context(self) -> str: + """Build a domain context string for correction prompts based on actual data""" + context_parts = [] + + expertise_areas = self.tag_dict.get("speaker_context", {}).get("expertise_areas", []) + categories = self.tag_dict.get("categories", {}) + + if categories: + top_categories = sorted(categories.items(), key=lambda x: x[1], reverse=True)[:3] + for category, _ in top_categories: + if category in ["development", "technical"]: + context_parts.append("Bitcoin development and technical implementation") + elif category in ["education", "tutorial"]: + context_parts.append("Bitcoin education and learning") + elif category in ["podcast", "interview"]: + context_parts.append("Bitcoin discussion and interviews") + elif category in ["conference", "presentation"]: + context_parts.append("Bitcoin conferences and presentations") + + for area in expertise_areas[:3]: + if area == "payments" and "payment" not in " ".join(context_parts).lower(): + context_parts.append("Bitcoin payments and Lightning Network") + elif area == "security" and "security" not in " ".join(context_parts).lower(): + context_parts.append("Bitcoin security and cryptography") + elif area == "mining" and "mining" not in " ".join(context_parts).lower(): + context_parts.append("Bitcoin mining and network") + + return ", ".join(context_parts) or "Bitcoin and blockchain technology" + + def get_statistics(self) -> Dict[str, Any]: + """Get statistics about the global tag dictionary""" + tags_dict = self.tag_dict.get("tags", {}) + categories_dict = self.tag_dict.get("categories", {}) + + return { + "videos_processed": self.tag_dict.get("video_count", 0), + "total_unique_tags": len(tags_dict), + "total_tag_occurrences": sum(data["frequency"] for data in tags_dict.values()), + "technical_terms_count": len(self.tag_dict.get("technical_terms", [])), + "project_names_count": len(self.tag_dict.get("project_names", [])), + "speakers_count": len(self.tag_dict.get("speaker_context", {}).get("common_speakers", [])), + "categories_count": len(categories_dict), + "expertise_areas_count": len(self.tag_dict.get("speaker_context", {}).get("expertise_areas", [])), + "last_updated": self.tag_dict.get("last_updated"), + "most_frequent_tags": sorted( + tags_dict.items(), + key=lambda x: x[1]["frequency"], + reverse=True + )[:10], + "most_common_categories": sorted( + categories_dict.items(), + key=lambda x: x[1], + reverse=True + )[:5] if categories_dict else [] + } \ No newline at end of file diff --git a/app/transcription.py b/app/transcription.py index 69f1245..0c8cf66 100644 --- a/app/transcription.py +++ b/app/transcription.py @@ -16,6 +16,7 @@ from app.exporters import ExporterFactory, TranscriptExporter from app.services.correction import CorrectionService from app.services.summarizer import SummarizerService +from app.services.global_tag_manager import GlobalTagManager class Transcription: @@ -58,6 +59,9 @@ def __init__( self.metadata_writer = DataWriter( self.__configure_tstbtc_metadata_dir() ) + + # Initialize global tag manager + self.tag_manager = GlobalTagManager(self.__configure_tstbtc_metadata_dir()) self.exporters: dict[ str, TranscriptExporter @@ -212,13 +216,20 @@ def _new_transcript_from_source(self, source: Source): # Keep preprocessing outputs for later use self.preprocessing_output.append(source.to_json()) # Initialize new transcript from source - self.transcripts.append( - Transcript( - source=source, - test_mode=self.test_mode, - metadata_file=metadata_file, - ) + transcript = Transcript( + source=source, + test_mode=self.test_mode, + metadata_file=metadata_file, ) + + # Update global tag dictionary with new transcript metadata + try: + self.tag_manager.update_from_transcript(transcript) + self.logger.debug(f"Updated global tag dictionary with transcript: {source.title}") + except Exception as e: + self.logger.warning(f"Failed to update global tag dictionary: {e}") + + self.transcripts.append(transcript) def add_transcription_source( self,