Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -133,4 +133,8 @@ dmypy.json
# Pyre type checker
.pyre/
.username
.idea
.idea
output/
test_feats/
logs/
metadata/
16 changes: 16 additions & 0 deletions app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,22 @@ def GITHUB_INSTALLATION_ID(self):
return self._get_env_variable('GITHUB_INSTALLATION_ID',
"To use GitHub App integration, you need to define a 'GITHUB_INSTALLATION_ID' in your .env file")

@property
def LLM_PROVIDER(self):
# Which LLM backend to use; defaults to 'openai' when the env var is unset.
# Unlike the key properties below, this is optional and never raises.
return os.getenv('LLM_PROVIDER', 'openai')

@property
def OPENAI_API_KEY(self):
# API key for the OpenAI provider.
# NOTE(review): no explanatory message is passed to _get_env_variable here,
# unlike GITHUB_INSTALLATION_ID above — confirm the helper produces a
# sensible default error when the variable is missing.
return self._get_env_variable('OPENAI_API_KEY')

@property
def GOOGLE_API_KEY(self):
# API key for the Google (Gemini) provider; raised via _get_env_variable
# if the environment variable is not set.
return self._get_env_variable('GOOGLE_API_KEY')

@property
def CLAUDE_API_KEY(self):
# API key for the Anthropic Claude provider; raised via _get_env_variable
# if the environment variable is not set.
return self._get_env_variable('CLAUDE_API_KEY')

# Initialize the Settings class and expose an instance
settings = Settings()

Expand Down
25 changes: 20 additions & 5 deletions app/exporters.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def _create_with_metadata(self, transcript: Transcript, **kwargs) -> str:

Args:
transcript: The transcript to export
**kwargs: Additional parameters like review_flag
**kwargs: Additional parameters like review_flag and content_key

Returns:
The complete Markdown content with metadata
Expand All @@ -215,6 +215,13 @@ def increase_indent(self, flow=False, indentless=False):
# Get metadata from the source
metadata = transcript.source.to_json()

# Determine which content to use
content_key = kwargs.get("content_key", "corrected_text")
content = transcript.outputs.get(content_key, transcript.outputs.get("raw"))

if content is None:
raise Exception(f"No transcript content found for key '{content_key}' or 'raw'")

# Add or modify specific fields
if self.transcript_by:
review_flag = kwargs.get("review_flag", "")
Expand Down Expand Up @@ -312,32 +319,40 @@ def export(self, transcript: Transcript, **kwargs) -> str:
Args:
transcript: The transcript to export
add_timestamp: Whether to add a timestamp to the filename (default: False)
content_key: The key in transcript.outputs to use for the content (default: "raw")
suffix: A suffix to add to the filename (e.g., "_raw")
**kwargs: Additional parameters (unused)

Returns:
The path to the exported text file
"""
self.logger.debug("Exporting transcript to plain text...")

if transcript.outputs["raw"] is None:
raise Exception("No transcript content found")
content_key = kwargs.get("content_key", "raw")
content = transcript.outputs.get(content_key)
if content is None and content_key == "summary":
content = transcript.summary

if content is None:
raise Exception(f"No content found for key: {content_key}")

# Get parameters
add_timestamp = kwargs.get("add_timestamp", False)
suffix = kwargs.get("suffix", "")

# Get output directory
output_dir = self.get_output_path(transcript)

# Construct file path
file_path = self.construct_file_path(
directory=output_dir,
filename=transcript.title,
filename=f"{transcript.title}{suffix}",
file_type="txt",
include_timestamp=add_timestamp,
)

# Write to file
result_path = self.write_to_file(transcript.outputs["raw"], file_path)
result_path = self.write_to_file(content, file_path)

self.logger.info(f"(exporter) Text file written to: {result_path}")
return result_path
Expand Down
35 changes: 25 additions & 10 deletions app/github_api_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,13 +93,17 @@ def create_branch(self, repo_type, branch_name, sha):
response = self._make_request('POST', url, json=data)
return response.json()

def create_or_update_file(self, repo_type, file_path, content, commit_message, branch):
def create_or_update_file(self, repo_type, file_path, content, commit_message, branch, get_sha=False):
# Create a new file, or update an existing one, through the GitHub
# repository contents API (PUT /repos/{owner}/{repo}/contents/{path}).
# When get_sha=True, the file's current blob SHA is fetched first — the
# contents API requires it when overwriting an existing file on `branch`.
url = f"https://api.github.com/repos/{self.repos[repo_type]['owner']}/{self.repos[repo_type]['name']}/contents/{quote(file_path)}"
data = {
"message": commit_message,
# The contents API requires the file body to be base64-encoded.
"content": base64.b64encode(content.encode()).decode(),
"branch": branch
}
if get_sha:
# NOTE(review): if the file does not yet exist on `branch`, the GET
# response will have no 'sha' key and this raises — confirm callers
# only pass get_sha=True for paths already committed to the branch.
response = self._make_request('GET', url + f'?ref={branch}')
data['sha'] = response.json()['sha']

response = self._make_request('PUT', url, json=data)
return response.json()

Expand All @@ -114,23 +118,34 @@ def create_pull_request(self, repo_type, title, head, base, body):
response = self._make_request('POST', url, json=data)
return response.json()

def push_transcripts(self, transcripts: list[Transcript]) -> str | None:
def push_transcripts(self, transcripts: list[Transcript], markdown_exporter) -> str | None:
try:
default_branch = self.get_default_branch('transcripts')
branch_sha = self.get_branch_sha('transcripts', default_branch)
branch_name = f"transcripts-{''.join(random.choices('0123456789', k=6))}"
branch_name = f"transcripts-{'' .join(random.choices('0123456789', k=6))}"
self.create_branch('transcripts', branch_name, branch_sha)

for transcript in transcripts:
if transcript.outputs and transcript.outputs['markdown']:
with open(transcript.outputs['markdown'], 'r') as file:
content = file.read()
# First commit: Raw transcript
raw_content = markdown_exporter._create_with_metadata(transcript, content_key='raw')
self.create_or_update_file(
'transcripts',
transcript.output_path_with_title + ".md",
raw_content,
f'ai(transcript): "{transcript.title}" (raw)',
branch_name
)

# Second commit: Corrected transcript
if transcript.outputs.get('corrected_text'):
corrected_content = markdown_exporter._create_with_metadata(transcript, content_key='corrected_text')
self.create_or_update_file(
'transcripts',
transcript.output_path_with_title,
content,
f'ai(transcript): "{transcript.title}" ({transcript.source.loc})',
branch_name
transcript.output_path_with_title + ".md",
corrected_content,
f'ai(transcript): "{transcript.title}" (corrected)',
branch_name,
get_sha=True # We need the SHA of the file to update it
)

pr = self.create_pull_request(
Expand Down
129 changes: 129 additions & 0 deletions app/services/correction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
from app.transcript import Transcript
from app.logging import get_logger
from app.services.global_tag_manager import GlobalTagManager
import openai
from app.config import settings

logger = get_logger()

class CorrectionService:
    """LLM-based post-processing that fixes obvious ASR errors in a transcript.

    Builds a context-rich prompt from the transcript's own metadata plus a
    global tag/terminology knowledge base, sends the raw transcript text to
    the configured LLM, and stores the result in
    ``transcript.outputs['corrected_text']``.
    """

    def __init__(self, provider='openai', model='gpt-4o'):
        """Configure the LLM client.

        Args:
            provider: LLM backend identifier; only 'openai' is supported.
            model: Model name passed to the chat completions endpoint.

        Raises:
            ValueError: If ``provider`` is not a supported backend.
        """
        self.provider = provider
        self.model = model
        self.tag_manager = GlobalTagManager()
        if self.provider == 'openai':
            # NOTE(review): this sets the API key on the shared `openai`
            # module object (pre-1.0 configuration style), which mutates
            # global state — confirm the pinned openai package version
            # still supports this pattern.
            self.client = openai
            self.client.api_key = settings.OPENAI_API_KEY
        else:
            raise ValueError(f"Unsupported LLM provider: {provider}")

    def process(self, transcript: Transcript, **kwargs):
        """Correct ``transcript.outputs['raw']`` and store the result.

        Args:
            transcript: Transcript whose raw ASR text should be corrected.
            **kwargs: Optional ``keywords`` — extra priority terms the LLM
                should pay special attention to when correcting.

        Raises:
            Exception: If the transcript has no raw text to correct.
        """
        logger.info(f"Correcting transcript with {self.provider}...")
        keywords = kwargs.get('keywords', [])

        # Fail fast with an explicit error (matching the exporters' style)
        # instead of a cryptic KeyError here or AttributeError on
        # `text.strip()` deep inside the prompt builder.
        raw_text = transcript.outputs.get('raw') if transcript.outputs else None
        if raw_text is None:
            raise Exception("No transcript content found for key 'raw'")

        metadata = transcript.source.to_json()
        global_context = self.tag_manager.get_correction_context()

        prompt = self._build_enhanced_prompt(raw_text, keywords, metadata, global_context)

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}]
        )
        corrected_text = response.choices[0].message.content

        transcript.outputs['corrected_text'] = corrected_text
        logger.info("Correction complete.")

    def _build_enhanced_prompt(self, text, keywords, metadata, global_context):
        """Assemble the full correction prompt.

        Combines fixed correction instructions with per-video metadata, the
        aggregated global knowledge base, and caller-supplied keywords.

        Args:
            text: Raw ASR transcript text (must not be ``None``).
            keywords: Extra priority terms; may be empty.
            metadata: Per-video metadata dict from ``source.to_json()``.
            global_context: Aggregated terminology/tag stats; may be ``{}``.

        Returns:
            The complete prompt string.
        """
        # Fixed instructions: minimal-change correction only.
        prompt = (
            "You are a transcript correction specialist with expertise in Bitcoin and blockchain terminology.\n\n"
            "The following transcript was generated by automatic speech recognition (ASR). Your task is to "
            "correct ONLY the obvious mistakes while keeping the transcript as close to the original as possible.\n\n"
            "DO NOT:\n"
            "- Rephrase or rewrite sentences\n"
            "- Change the speaker's style or tone\n"
            "- Add or remove content\n"
            "- Make major structural changes\n\n"
            "DO:\n"
            "- Fix spelling errors and typos\n"
            "- Correct misheard words using context\n"
            "- Fix technical terminology and proper names\n"
            "- Maintain the exact same flow and structure\n\n"
            "--- Current Video Metadata ---\n"
        )

        # Per-video metadata: only fields that are actually present.
        if metadata.get('title'):
            prompt += f"Video Title: {metadata['title']}\n"
        if metadata.get('speakers'):
            prompt += f"Speakers: {', '.join(metadata['speakers'])}\n"
        if metadata.get('tags'):
            prompt += f"Video Tags: {', '.join(metadata['tags'])}\n"
        if metadata.get('categories'):
            prompt += f"Categories: {', '.join(metadata['categories'])}\n"
        if metadata.get('youtube', {}).get('description'):
            # Truncate long descriptions to keep the prompt compact.
            description = metadata['youtube']['description'][:200] + "..." if len(metadata['youtube']['description']) > 200 else metadata['youtube']['description']
            prompt += f"Description: {description}\n"

        # Global knowledge base aggregated across previously processed videos.
        # Each list is capped to keep the prompt within a reasonable size.
        video_count = global_context.get('video_count', 0)
        prompt += f"\n--- Global Bitcoin Knowledge Base (From {video_count} Transcripts) ---\n"

        if global_context.get('frequent_tags'):
            frequent_tags = global_context['frequent_tags'][:15]
            prompt += f"Most Common Topics: {', '.join(frequent_tags)}\n"

        if global_context.get('technical_terms'):
            tech_terms = global_context['technical_terms'][:20]
            prompt += f"Technical Terms to Recognize: {', '.join(tech_terms)}\n"

        if global_context.get('project_names'):
            projects = global_context['project_names'][:15]
            prompt += f"Bitcoin Projects/Tools: {', '.join(projects)}\n"

        if global_context.get('common_speakers'):
            speakers = global_context['common_speakers'][:10]
            prompt += f"Frequent Speakers: {', '.join(speakers)}\n"

        if global_context.get('common_categories'):
            categories = global_context['common_categories'][:8]
            prompt += f"Common Content Categories: {', '.join(categories)}\n"

        if global_context.get('expertise_areas'):
            areas = global_context['expertise_areas'][:8]
            prompt += f"Domain Expertise Areas: {', '.join(areas)}\n"

        if global_context.get('domain_context'):
            prompt += f"Primary Domain Focus: {global_context['domain_context']}\n"

        prompt += "\n--- Focus Areas for Correction ---\n"
        prompt += "Using the metadata and global knowledge, focus on correcting:\n"
        prompt += "1. Technical terms (ensure proper spelling and capitalization)\n"
        prompt += "2. Speaker names and project names (match known variations)\n"
        prompt += "3. Common ASR mishears (but, bit, big -> Bitcoin when context suggests it)\n"
        prompt += "4. Homophones and similar-sounding words in Bitcoin context\n"
        prompt += "5. Obvious typos and spelling mistakes\n\n"
        prompt += "IMPORTANT: Make minimal changes - only fix clear errors, don't improve the text.\n"

        if global_context.get('tag_variations'):
            variations = global_context['tag_variations']
            if variations:
                prompt += "\n--- Common Term Variations ---\n"
                for base_term, variants in list(variations.items())[:5]:
                    prompt += f"{base_term}: {', '.join(variants)}\n"

        if keywords:
            prompt += (
                "\n--- Additional Priority Keywords ---\n"
                "Pay special attention to these terms and ensure correct spelling/formatting:\n- "
            )
            prompt += "\n- ".join(keywords)

        prompt += f"\n\n--- Transcript Start ---\n\n{text.strip()}\n\n--- Transcript End ---\n\n"
        prompt += "Return ONLY the corrected transcript. Make minimal changes - fix only obvious errors while "
        prompt += "preserving the original wording, sentence structure, and speaker's natural expression."

        return prompt

    def _build_prompt(self, text, keywords, metadata):
        """Legacy method for backward compatibility."""
        return self._build_enhanced_prompt(text, keywords, metadata, {})
Loading