Load and chunk documents for Retrieval-Augmented Generation.
npm install rag-document-loader

Supported formats:

- PDF (.pdf)
- Word (.docx)
- HTML (.html)
- Markdown (.md)
- Text (.txt)
- CSV (.csv)
- JSON (.json)
import { DocumentLoader, RecursiveTextSplitter } from 'rag-document-loader';
// Load documents
const loader = new DocumentLoader();
const docs = await loader.load('./documents');
// Split into chunks
const splitter = new RecursiveTextSplitter({
chunkSize: 1000,
chunkOverlap: 200,
});
const chunks = await splitter.split(docs);
// Each chunk has:
// - content: string
// - metadata: { source, page, type, ... }

// By character count
new CharacterTextSplitter({ chunkSize: 1000 });
// By tokens (for LLMs)
new TokenTextSplitter({ chunkSize: 500, model: 'gpt-4' });
// By semantic similarity
new SemanticTextSplitter({ embeddings: openaiEmbeddings });
// By markdown headers
new MarkdownHeaderSplitter();

const loader = new DocumentLoader({
extractMetadata: true,
// Extract: title, author, date, keywords
});

License: MIT