Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -133,4 +133,8 @@ dmypy.json
# Pyre type checker
.pyre/
.username
.idea
.idea
output/
test_feats/
logs/
metadata/
16 changes: 16 additions & 0 deletions app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,22 @@ def GITHUB_INSTALLATION_ID(self):
return self._get_env_variable('GITHUB_INSTALLATION_ID',
"To use GitHub App integration, you need to define a 'GITHUB_INSTALLATION_ID' in your .env file")

@property
def LLM_PROVIDER(self):
# Which LLM backend to use; defaults to 'openai' when the env var is unset.
# Unlike the key properties below, this is optional and never raises.
return os.getenv('LLM_PROVIDER', 'openai')

@property
def OPENAI_API_KEY(self):
# API key for the OpenAI provider.
# NOTE(review): no explanatory message is passed to _get_env_variable here,
# unlike GITHUB_INSTALLATION_ID above — confirm the helper produces a
# sensible default error when the variable is missing.
return self._get_env_variable('OPENAI_API_KEY')

@property
def GOOGLE_API_KEY(self):
# API key for the Google (Gemini) provider; raised via _get_env_variable
# if the environment variable is not set.
return self._get_env_variable('GOOGLE_API_KEY')

@property
def CLAUDE_API_KEY(self):
# API key for the Anthropic Claude provider; raised via _get_env_variable
# if the environment variable is not set.
return self._get_env_variable('CLAUDE_API_KEY')

# Initialize the Settings class and expose an instance
settings = Settings()

Expand Down
25 changes: 20 additions & 5 deletions app/exporters.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def _create_with_metadata(self, transcript: Transcript, **kwargs) -> str:

Args:
transcript: The transcript to export
**kwargs: Additional parameters like review_flag
**kwargs: Additional parameters like review_flag and content_key

Returns:
The complete Markdown content with metadata
Expand All @@ -215,6 +215,13 @@ def increase_indent(self, flow=False, indentless=False):
# Get metadata from the source
metadata = transcript.source.to_json()

# Determine which content to use
content_key = kwargs.get("content_key", "corrected_text")
content = transcript.outputs.get(content_key, transcript.outputs.get("raw"))

if content is None:
raise Exception(f"No transcript content found for key '{content_key}' or 'raw'")

# Add or modify specific fields
if self.transcript_by:
review_flag = kwargs.get("review_flag", "")
Expand Down Expand Up @@ -312,32 +319,40 @@ def export(self, transcript: Transcript, **kwargs) -> str:
Args:
transcript: The transcript to export
add_timestamp: Whether to add a timestamp to the filename (default: False)
content_key: The key in transcript.outputs to use for the content (default: "raw")
suffix: A suffix to add to the filename (e.g., "_raw")
**kwargs: Additional parameters (unused)

Returns:
The path to the exported text file
"""
self.logger.debug("Exporting transcript to plain text...")

if transcript.outputs["raw"] is None:
raise Exception("No transcript content found")
content_key = kwargs.get("content_key", "raw")
content = transcript.outputs.get(content_key)
if content is None and content_key == "summary":
content = transcript.summary

if content is None:
raise Exception(f"No content found for key: {content_key}")

# Get parameters
add_timestamp = kwargs.get("add_timestamp", False)
suffix = kwargs.get("suffix", "")

# Get output directory
output_dir = self.get_output_path(transcript)

# Construct file path
file_path = self.construct_file_path(
directory=output_dir,
filename=transcript.title,
filename=f"{transcript.title}{suffix}",
file_type="txt",
include_timestamp=add_timestamp,
)

# Write to file
result_path = self.write_to_file(transcript.outputs["raw"], file_path)
result_path = self.write_to_file(content, file_path)

self.logger.info(f"(exporter) Text file written to: {result_path}")
return result_path
Expand Down
35 changes: 25 additions & 10 deletions app/github_api_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,13 +93,17 @@ def create_branch(self, repo_type, branch_name, sha):
response = self._make_request('POST', url, json=data)
return response.json()

def create_or_update_file(self, repo_type, file_path, content, commit_message, branch):
def create_or_update_file(self, repo_type, file_path, content, commit_message, branch, get_sha=False):
# Create a new file, or update an existing one, through the GitHub
# repository contents API (PUT /repos/{owner}/{repo}/contents/{path}).
# When get_sha=True, the file's current blob SHA is fetched first — the
# contents API requires it when overwriting an existing file on `branch`.
url = f"https://api.github.com/repos/{self.repos[repo_type]['owner']}/{self.repos[repo_type]['name']}/contents/{quote(file_path)}"
data = {
"message": commit_message,
# The contents API requires the file body to be base64-encoded.
"content": base64.b64encode(content.encode()).decode(),
"branch": branch
}
if get_sha:
# NOTE(review): if the file does not yet exist on `branch`, the GET
# response will have no 'sha' key and this raises — confirm callers
# only pass get_sha=True for paths already committed to the branch.
response = self._make_request('GET', url + f'?ref={branch}')
data['sha'] = response.json()['sha']

response = self._make_request('PUT', url, json=data)
return response.json()

Expand All @@ -114,23 +118,34 @@ def create_pull_request(self, repo_type, title, head, base, body):
response = self._make_request('POST', url, json=data)
return response.json()

def push_transcripts(self, transcripts: list[Transcript]) -> str | None:
def push_transcripts(self, transcripts: list[Transcript], markdown_exporter) -> str | None:
try:
default_branch = self.get_default_branch('transcripts')
branch_sha = self.get_branch_sha('transcripts', default_branch)
branch_name = f"transcripts-{''.join(random.choices('0123456789', k=6))}"
branch_name = f"transcripts-{'' .join(random.choices('0123456789', k=6))}"
self.create_branch('transcripts', branch_name, branch_sha)

for transcript in transcripts:
if transcript.outputs and transcript.outputs['markdown']:
with open(transcript.outputs['markdown'], 'r') as file:
content = file.read()
# First commit: Raw transcript
raw_content = markdown_exporter._create_with_metadata(transcript, content_key='raw')
self.create_or_update_file(
'transcripts',
transcript.output_path_with_title + ".md",
raw_content,
f'ai(transcript): "{transcript.title}" (raw)',
branch_name
)

# Second commit: Corrected transcript
if transcript.outputs.get('corrected_text'):
corrected_content = markdown_exporter._create_with_metadata(transcript, content_key='corrected_text')
self.create_or_update_file(
'transcripts',
transcript.output_path_with_title,
content,
f'ai(transcript): "{transcript.title}" ({transcript.source.loc})',
branch_name
transcript.output_path_with_title + ".md",
corrected_content,
f'ai(transcript): "{transcript.title}" (corrected)',
branch_name,
get_sha=True # We need the SHA of the file to update it
)

pr = self.create_pull_request(
Expand Down
129 changes: 129 additions & 0 deletions app/services/correction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
from app.transcript import Transcript
from app.logging import get_logger
from app.services.global_tag_manager import GlobalTagManager
import openai
from app.config import settings

logger = get_logger()

class CorrectionService:
    """LLM-based post-processing that fixes obvious ASR errors in a transcript.

    Builds a context-rich prompt from the transcript's own metadata plus a
    global tag/terminology knowledge base, sends the raw transcript text to
    the configured LLM, and stores the result in
    ``transcript.outputs['corrected_text']``.
    """

    def __init__(self, provider='openai', model='gpt-4o'):
        """Configure the LLM client.

        Args:
            provider: LLM backend identifier; only 'openai' is supported.
            model: Model name passed to the chat completions endpoint.

        Raises:
            ValueError: If ``provider`` is not a supported backend.
        """
        self.provider = provider
        self.model = model
        self.tag_manager = GlobalTagManager()
        if self.provider == 'openai':
            # NOTE(review): this sets the API key on the shared `openai`
            # module object (pre-1.0 configuration style), which mutates
            # global state — confirm the pinned openai package version
            # still supports this pattern.
            self.client = openai
            self.client.api_key = settings.OPENAI_API_KEY
        else:
            raise ValueError(f"Unsupported LLM provider: {provider}")

    def process(self, transcript: Transcript, **kwargs):
        """Correct ``transcript.outputs['raw']`` and store the result.

        Args:
            transcript: Transcript whose raw ASR text should be corrected.
            **kwargs: Optional ``keywords`` — extra priority terms the LLM
                should pay special attention to when correcting.

        Raises:
            Exception: If the transcript has no raw text to correct.
        """
        logger.info(f"Correcting transcript with {self.provider}...")
        keywords = kwargs.get('keywords', [])

        # Fail fast with an explicit error (matching the exporters' style)
        # instead of a cryptic KeyError here or AttributeError on
        # `text.strip()` deep inside the prompt builder.
        raw_text = transcript.outputs.get('raw') if transcript.outputs else None
        if raw_text is None:
            raise Exception("No transcript content found for key 'raw'")

        metadata = transcript.source.to_json()
        global_context = self.tag_manager.get_correction_context()

        prompt = self._build_enhanced_prompt(raw_text, keywords, metadata, global_context)

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}]
        )
        corrected_text = response.choices[0].message.content

        transcript.outputs['corrected_text'] = corrected_text
        logger.info("Correction complete.")

    def _build_enhanced_prompt(self, text, keywords, metadata, global_context):
        """Assemble the full correction prompt.

        Combines fixed correction instructions with per-video metadata, the
        aggregated global knowledge base, and caller-supplied keywords.

        Args:
            text: Raw ASR transcript text (must not be ``None``).
            keywords: Extra priority terms; may be empty.
            metadata: Per-video metadata dict from ``source.to_json()``.
            global_context: Aggregated terminology/tag stats; may be ``{}``.

        Returns:
            The complete prompt string.
        """
        # Fixed instructions: minimal-change correction only.
        prompt = (
            "You are a transcript correction specialist with expertise in Bitcoin and blockchain terminology.\n\n"
            "The following transcript was generated by automatic speech recognition (ASR). Your task is to "
            "correct ONLY the obvious mistakes while keeping the transcript as close to the original as possible.\n\n"
            "DO NOT:\n"
            "- Rephrase or rewrite sentences\n"
            "- Change the speaker's style or tone\n"
            "- Add or remove content\n"
            "- Make major structural changes\n\n"
            "DO:\n"
            "- Fix spelling errors and typos\n"
            "- Correct misheard words using context\n"
            "- Fix technical terminology and proper names\n"
            "- Maintain the exact same flow and structure\n\n"
            "--- Current Video Metadata ---\n"
        )

        # Per-video metadata: only fields that are actually present.
        if metadata.get('title'):
            prompt += f"Video Title: {metadata['title']}\n"
        if metadata.get('speakers'):
            prompt += f"Speakers: {', '.join(metadata['speakers'])}\n"
        if metadata.get('tags'):
            prompt += f"Video Tags: {', '.join(metadata['tags'])}\n"
        if metadata.get('categories'):
            prompt += f"Categories: {', '.join(metadata['categories'])}\n"
        if metadata.get('youtube', {}).get('description'):
            # Truncate long descriptions to keep the prompt compact.
            description = metadata['youtube']['description'][:200] + "..." if len(metadata['youtube']['description']) > 200 else metadata['youtube']['description']
            prompt += f"Description: {description}\n"

        # Global knowledge base aggregated across previously processed videos.
        # Each list is capped to keep the prompt within a reasonable size.
        video_count = global_context.get('video_count', 0)
        prompt += f"\n--- Global Bitcoin Knowledge Base (From {video_count} Transcripts) ---\n"

        if global_context.get('frequent_tags'):
            frequent_tags = global_context['frequent_tags'][:15]
            prompt += f"Most Common Topics: {', '.join(frequent_tags)}\n"

        if global_context.get('technical_terms'):
            tech_terms = global_context['technical_terms'][:20]
            prompt += f"Technical Terms to Recognize: {', '.join(tech_terms)}\n"

        if global_context.get('project_names'):
            projects = global_context['project_names'][:15]
            prompt += f"Bitcoin Projects/Tools: {', '.join(projects)}\n"

        if global_context.get('common_speakers'):
            speakers = global_context['common_speakers'][:10]
            prompt += f"Frequent Speakers: {', '.join(speakers)}\n"

        if global_context.get('common_categories'):
            categories = global_context['common_categories'][:8]
            prompt += f"Common Content Categories: {', '.join(categories)}\n"

        if global_context.get('expertise_areas'):
            areas = global_context['expertise_areas'][:8]
            prompt += f"Domain Expertise Areas: {', '.join(areas)}\n"

        if global_context.get('domain_context'):
            prompt += f"Primary Domain Focus: {global_context['domain_context']}\n"

        prompt += "\n--- Focus Areas for Correction ---\n"
        prompt += "Using the metadata and global knowledge, focus on correcting:\n"
        prompt += "1. Technical terms (ensure proper spelling and capitalization)\n"
        prompt += "2. Speaker names and project names (match known variations)\n"
        prompt += "3. Common ASR mishears (but, bit, big -> Bitcoin when context suggests it)\n"
        prompt += "4. Homophones and similar-sounding words in Bitcoin context\n"
        prompt += "5. Obvious typos and spelling mistakes\n\n"
        prompt += "IMPORTANT: Make minimal changes - only fix clear errors, don't improve the text.\n"

        if global_context.get('tag_variations'):
            variations = global_context['tag_variations']
            if variations:
                prompt += "\n--- Common Term Variations ---\n"
                for base_term, variants in list(variations.items())[:5]:
                    prompt += f"{base_term}: {', '.join(variants)}\n"

        if keywords:
            prompt += (
                "\n--- Additional Priority Keywords ---\n"
                "Pay special attention to these terms and ensure correct spelling/formatting:\n- "
            )
            prompt += "\n- ".join(keywords)

        prompt += f"\n\n--- Transcript Start ---\n\n{text.strip()}\n\n--- Transcript End ---\n\n"
        prompt += "Return ONLY the corrected transcript. Make minimal changes - fix only obvious errors while "
        prompt += "preserving the original wording, sentence structure, and speaker's natural expression."

        return prompt

    def _build_prompt(self, text, keywords, metadata):
        """Legacy method for backward compatibility."""
        return self._build_enhanced_prompt(text, keywords, metadata, {})
Loading