pdf-harvester
10
总安装量
8
周安装量
#30196
全站排名
安装命令
npx skills add https://github.com/mindmorass/reflex --skill pdf-harvester
Agent 安装分布
claude-code
7
antigravity
6
trae
6
opencode
6
gemini-cli
6
windsurf
5
Skill 文档
PDF Harvester Skill
Extract and ingest PDF documents into RAG with proper text extraction, table handling, and metadata.
Overview
PDFs are common for research papers, reports, manuals, and ebooks. This skill covers:
- Text extraction with layout preservation
- Table extraction and conversion to markdown
- Academic paper patterns (abstract, sections, citations)
- OCR for scanned documents
- Multi-page chunking strategies
Prerequisites
# Core extraction
pip install pdfplumber pymupdf
# For OCR (scanned documents)
pip install pytesseract pdf2image
# Also need: brew install tesseract poppler (macOS)
# For academic papers
pip install arxiv # If fetching from arXiv
Extraction Methods
Method 1: pdfplumber (Recommended)
Best for structured PDFs with tables.
#!/usr/bin/env python3
"""PDF extraction using pdfplumber."""
import pdfplumber
from pathlib import Path
from typing import Dict, List, Optional
import re
def extract_pdf_text(
    pdf_path: str,
    extract_tables: bool = True
) -> Dict:
    """
    Extract per-page text (and optionally tables) from a PDF.

    Args:
        pdf_path: Path to the PDF file.
        extract_tables: When True, also pull tables out of each page.

    Returns:
        Dict with keys "pages", "tables", "metadata", "total_pages".
    """
    result: Dict = {
        "pages": [],
        "tables": [],
        "metadata": {},
        "total_pages": 0,
    }
    with pdfplumber.open(pdf_path) as pdf:
        result["total_pages"] = len(pdf.pages)
        result["metadata"] = pdf.metadata or {}
        for page_index, page in enumerate(pdf.pages, 1):
            # extract_text() may return None for empty/image pages.
            page_text = page.extract_text() or ""
            result["pages"].append({
                "page_number": page_index,
                "text": page_text,
                "width": page.width,
                "height": page.height,
            })
            if not extract_tables:
                continue
            for table_index, raw_table in enumerate(page.extract_tables(), 1):
                if not raw_table:
                    continue
                result["tables"].append({
                    "page_number": page_index,
                    "table_number": table_index,
                    "data": raw_table,
                    "markdown": table_to_markdown(raw_table),
                })
    return result
def table_to_markdown(table: List[List]) -> str:
    """
    Convert extracted table data (list of rows) to a markdown table.

    The first row is treated as the header. ``None`` cells become empty
    strings, embedded newlines are flattened to spaces, short rows are
    padded and long rows truncated to the header width.

    Args:
        table: Rows of cells, as produced by pdfplumber's extract_tables().

    Returns:
        Markdown table text ending in a newline, or "" for empty input.
    """
    if not table:
        return ""

    def clean_cell(cell) -> str:
        # None cells are common in extracted tables; flatten newlines so
        # a multi-line cell cannot break the markdown row layout.
        if cell is None:
            return ""
        return str(cell).replace("\n", " ").strip()

    # Header row
    headers = [clean_cell(c) for c in table[0]]
    md = "| " + " | ".join(headers) + " |\n"
    md += "| " + " | ".join(["---"] * len(headers)) + " |\n"
    # Data rows
    for row in table[1:]:
        cells = [clean_cell(c) for c in row]
        # Pad short rows; truncate long ones to the header width.
        while len(cells) < len(headers):
            cells.append("")
        md += "| " + " | ".join(cells[:len(headers)]) + " |\n"
    return md
Method 2: PyMuPDF (fitz)
Faster, better for large PDFs.
#!/usr/bin/env python3
"""PDF extraction using PyMuPDF."""
import fitz # PyMuPDF
from typing import Dict, List
def extract_with_pymupdf(pdf_path: str) -> Dict:
    """
    Extract per-page text from a PDF via PyMuPDF (fitz).

    Faster than pdfplumber; well suited to large documents.

    Returns:
        Dict with "pages", "metadata", and "total_pages".
    """
    doc = fitz.open(pdf_path)
    pages: List[Dict] = []
    for index, page in enumerate(doc, 1):
        # Plain text with layout preservation.
        plain_text = page.get_text("text")
        # "dict" mode exposes layout blocks; only the count is recorded.
        block_count = len(page.get_text("dict")["blocks"])
        pages.append({
            "page_number": index,
            "text": plain_text,
            "blocks": block_count,
        })
    result = {
        "pages": pages,
        "metadata": doc.metadata,
        "total_pages": len(doc),
    }
    doc.close()
    return result
def extract_with_structure(pdf_path: str) -> Dict:
    """
    Extract text with a crude heading/paragraph classification.

    A span is labelled a heading when its font size exceeds 14pt or its
    font name contains "bold"; everything else becomes a paragraph entry.
    """
    doc = fitz.open(pdf_path)
    pages = []
    for index, page in enumerate(doc, 1):
        content = []
        for block in page.get_text("dict")["blocks"]:
            if block["type"] != 0:
                # Non-text blocks (e.g. images) carry no spans; skip.
                continue
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    span_text = span["text"].strip()
                    size = span["size"]
                    bold = "bold" in span["font"].lower()
                    if size > 14 or bold:
                        content.append({
                            "type": "heading",
                            "text": span_text,
                            "size": size,
                        })
                    else:
                        content.append({
                            "type": "paragraph",
                            "text": span_text,
                        })
        pages.append({"page_number": index, "content": content})
    doc.close()
    return {"pages": pages, "total_pages": len(pages)}
Method 3: OCR for Scanned PDFs
#!/usr/bin/env python3
"""OCR extraction for scanned PDFs."""
import pytesseract
from pdf2image import convert_from_path
from typing import Dict, List
def extract_with_ocr(
    pdf_path: str,
    language: str = "eng",
    dpi: int = 300
) -> Dict:
    """
    OCR a scanned PDF by rasterizing each page and running Tesseract.

    Args:
        pdf_path: Path to the PDF.
        language: Tesseract language code (e.g. "eng").
        dpi: Rasterization resolution; higher generally improves OCR.

    Returns:
        Dict with "pages", "total_pages", and an "ocr_used" flag.
    """
    # Render every page to an image first; OCR works on pixels, not text.
    page_images = convert_from_path(pdf_path, dpi=dpi)
    pages = [
        {
            "page_number": number,
            "text": pytesseract.image_to_string(image, lang=language),
            "ocr": True,
        }
        for number, image in enumerate(page_images, 1)
    ]
    return {
        "pages": pages,
        "total_pages": len(pages),
        "ocr_used": True,
    }
def is_scanned_pdf(pdf_path: str) -> bool:
    """
    Heuristically detect whether a PDF is scanned (image-based).

    Looks at up to the first three pages: if any of them yields more than
    100 characters of extractable text, the PDF is considered digital.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        True if the PDF appears to be scanned (needs OCR), else False.
    """
    import fitz

    doc = fitz.open(pdf_path)
    try:
        # fitz.Document supports integer page indexing only -- the original
        # doc[:min(3, len(doc))] slice raises, so index pages explicitly.
        for page_index in range(min(3, len(doc))):
            text = doc[page_index].get_text().strip()
            if len(text) > 100:  # Has extractable text
                return False
        return True
    finally:
        # Close the document on every path, including errors.
        doc.close()
Chunking Strategies
Strategy 1: Page-Based
Simple chunking by page boundaries.
def chunk_by_pages(
    extracted: Dict,
    pages_per_chunk: int = 1
) -> List[Dict]:
    """
    Chunk an extracted PDF along page boundaries.

    Args:
        extracted: Output of an extract_* function (needs a "pages" list
            of dicts with "page_number" and "text").
        pages_per_chunk: Number of consecutive pages merged per chunk.

    Returns:
        List of chunk dicts with content, page range, and chunk index.
    """
    chunks: List[Dict] = []
    pages = extracted["pages"]
    for i in range(0, len(pages), pages_per_chunk):
        group = pages[i:i + pages_per_chunk]
        # Join page texts with a newline (the original "\n" literal had
        # been corrupted into a hard line break).
        text = "\n".join(p["text"] for p in group)
        chunks.append({
            "content": text,
            "page_start": group[0]["page_number"],
            "page_end": group[-1]["page_number"],
            "chunk_index": len(chunks),
        })
    return chunks
Strategy 2: Section-Based
Chunk by document sections/headings.
def chunk_by_sections(
    extracted: Dict,
    heading_patterns: List[str] = None
) -> List[Dict]:
    """
    Chunk an extracted PDF at detected section headings.

    Args:
        extracted: Output of an extract_* function (needs a "pages" list).
        heading_patterns: Regexes matched against each stripped line to
            detect a heading. Defaults cover markdown headings, numbered
            sections, ALL-CAPS lines, and common paper section names.

    Returns:
        One chunk per section with content, section title, and index.
    """
    import re

    if heading_patterns is None:
        heading_patterns = [
            r'^#+\s',                 # Markdown headings
            r'^\d+\.\s+[A-Z]',        # Numbered sections ("1. Intro")
            r'^[A-Z][A-Z\s]+$',       # ALL CAPS headings
            r'^(Abstract|Introduction|Conclusion|References)',
        ]
    full_text = "\n".join(p["text"] for p in extracted["pages"])
    # Find section boundaries; text before the first heading is collected
    # under a synthetic "Introduction" section.
    sections: List[Dict] = []
    current_section = {"title": "Introduction", "content": "", "start_pos": 0}
    for line in full_text.split("\n"):
        is_heading = any(
            re.match(pattern, line.strip())
            for pattern in heading_patterns
        )
        if is_heading and current_section["content"].strip():
            sections.append(current_section)
            current_section = {
                "title": line.strip(),
                "content": "",
                "start_pos": len(sections),
            }
        else:
            current_section["content"] += line + "\n"
    # Don't forget the trailing section.
    if current_section["content"].strip():
        sections.append(current_section)
    return [
        {
            "content": s["content"].strip(),
            "section": s["title"],
            "chunk_index": i,
        }
        for i, s in enumerate(sections)
    ]
Strategy 3: Semantic Paragraphs
Chunk by paragraph with size limits.
def chunk_by_paragraphs(
    extracted: Dict,
    max_chunk_size: int = 500,  # words
    overlap: int = 50
) -> List[Dict]:
    """
    Chunk an extracted PDF by paragraphs (lines) with optional overlap.

    Args:
        extracted: Output of an extract_* function (needs a "pages" list).
        max_chunk_size: Soft word limit per chunk.
        overlap: When > 0, the last paragraph of a finished chunk is
            repeated at the start of the next chunk for context
            continuity. (Previously this parameter was accepted but
            ignored; 0 now disables the carry-over.)

    Returns:
        List of chunk dicts with content, index, and word count.
    """
    full_text = "\n".join(p["text"] for p in extracted["pages"])
    # Split into paragraphs (non-empty lines).
    paragraphs = [p.strip() for p in full_text.split("\n") if p.strip()]
    chunks: List[Dict] = []
    current_chunk: List[str] = []
    current_size = 0
    for para in paragraphs:
        para_size = len(para.split())
        if current_size + para_size > max_chunk_size and current_chunk:
            # Save current chunk
            chunks.append({
                "content": "\n".join(current_chunk),
                "chunk_index": len(chunks),
                "word_count": current_size,
            })
            # Start new chunk, honoring the overlap switch.
            overlap_text = current_chunk[-1] if (current_chunk and overlap > 0) else ""
            current_chunk = [overlap_text] if overlap_text else []
            current_size = len(overlap_text.split()) if overlap_text else 0
        current_chunk.append(para)
        current_size += para_size
    # Last chunk
    if current_chunk:
        chunks.append({
            "content": "\n".join(current_chunk),
            "chunk_index": len(chunks),
            "word_count": current_size,
        })
    return chunks
Academic Paper Pattern
Special handling for research papers.
def extract_academic_paper(pdf_path: str) -> Dict:
    """
    Extract an academic paper with lightweight structure detection.

    Identifies title, abstract, numbered sections, and the references
    block using regex heuristics over the concatenated page text.
    The "authors" field is initialized but never populated (no reliable
    heuristic exists here).

    Args:
        pdf_path: Path to the paper's PDF file.

    Returns:
        Dict with title, authors, abstract, sections, references, tables
        (and "references_text" when a references heading is found).
    """
    import re

    extracted = extract_pdf_text(pdf_path)
    full_text = "\n".join(p["text"] for p in extracted["pages"])
    paper = {
        "title": "",
        "authors": [],
        "abstract": "",
        "sections": [],
        "references": [],
        "tables": extracted["tables"],
    }
    # Title heuristic: first moderately long line near the top of page 1.
    for line in full_text.split("\n")[:10]:
        if len(line) > 20 and len(line) < 200:
            paper["title"] = line.strip()
            break
    # Abstract: text between "Abstract" and an Introduction heading
    # (or the next capitalized line).
    abstract_match = re.search(
        r'Abstract[:\s]*\n?(.*?)(?=\n(?:1\.?\s+)?Introduction|\n[A-Z])',
        full_text,
        re.DOTALL | re.IGNORECASE
    )
    if abstract_match:
        paper["abstract"] = abstract_match.group(1).strip()
    # Numbered section headings on their own line, e.g. "\n3. Methodology\n".
    section_pattern = r'\n(\d+\.?\s+[A-Z][^\n]+)\n'
    section_positions = [
        (m.group(1), m.start())
        for m in re.finditer(section_pattern, full_text)
    ]
    for i, (title, start) in enumerate(section_positions):
        # Each section runs to the start of the next heading (or EOF).
        end = (
            section_positions[i + 1][1]
            if i + 1 < len(section_positions)
            else len(full_text)
        )
        paper["sections"].append({
            "title": title.strip(),
            "content": full_text[start:end].strip(),
        })
    # References section: everything after a References/Bibliography heading.
    ref_match = re.search(
        r'(?:References|Bibliography)\s*\n(.*?)$',
        full_text,
        re.DOTALL | re.IGNORECASE
    )
    if ref_match:
        paper["references_text"] = ref_match.group(1).strip()
    return paper
Full Harvesting Pipeline
#!/usr/bin/env python3
"""Complete PDF harvesting pipeline."""
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional
import hashlib
async def harvest_pdf(
    pdf_path: str,
    collection: str,
    chunk_strategy: str = "paragraphs",  # pages, sections, paragraphs
    is_academic: bool = False,
    use_ocr: bool = False
) -> Dict:
    """
    Harvest a PDF document into RAG.

    Extracts text (falling back to OCR for scanned files), chunks it by
    the requested strategy, and ingests every chunk -- and each extracted
    table separately -- into the target collection.

    Args:
        pdf_path: Path to PDF file
        collection: Target RAG collection
        chunk_strategy: How to chunk the document ("pages", "sections";
            anything else falls through to paragraph chunking)
        is_academic: Use academic paper extraction for extra metadata
        use_ocr: Force OCR extraction even when text is extractable

    Returns:
        Summary dict: status, filename, pages, chunks, tables,
        collection, doc_id.
    """
    path = Path(pdf_path)
    # Check if OCR needed (forced by caller, or auto-detected scanned PDF)
    if use_ocr or is_scanned_pdf(pdf_path):
        extracted = extract_with_ocr(pdf_path)
    else:
        extracted = extract_pdf_text(pdf_path)
    # Get document metadata
    doc_metadata = {
        "source_type": "pdf",
        "source_path": str(path.absolute()),
        "filename": path.name,
        "total_pages": extracted["total_pages"],
        # NOTE(review): naive local time without a timezone -- the metadata
        # schema example shows a trailing "Z"; confirm whether UTC is expected.
        "harvested_at": datetime.now().isoformat(),
        # OCR extraction does not produce a "metadata" key, hence .get().
        "pdf_metadata": extracted.get("metadata", {})
    }
    # Academic paper special handling (runs a second extraction pass)
    if is_academic:
        paper = extract_academic_paper(pdf_path)
        doc_metadata["title"] = paper["title"]
        doc_metadata["abstract"] = paper["abstract"]
        doc_metadata["is_academic"] = True
    # Chunk based on strategy
    if chunk_strategy == "pages":
        chunks = chunk_by_pages(extracted)
    elif chunk_strategy == "sections":
        chunks = chunk_by_sections(extracted)
    else:
        chunks = chunk_by_paragraphs(extracted)
    # Generate document ID from content hash (md5 is used only as a short,
    # stable fingerprint here -- not for security)
    content_hash = hashlib.md5(
        "".join(p["text"] for p in extracted["pages"]).encode()
    ).hexdigest()[:12]
    doc_id = f"pdf_{content_hash}"
    # Ingest chunks
    ingested = 0
    for chunk in chunks:
        chunk_metadata = {
            **doc_metadata,
            "chunk_index": chunk["chunk_index"],
            "total_chunks": len(chunks),
        }
        # Add page info if available (only page-based chunks carry it)
        if "page_start" in chunk:
            chunk_metadata["page_start"] = chunk["page_start"]
            chunk_metadata["page_end"] = chunk["page_end"]
        # Add section info if available (only section-based chunks carry it)
        if "section" in chunk:
            chunk_metadata["section"] = chunk["section"]
        # NOTE(review): `ingest` is presumably the project's async RAG
        # ingestion helper; it is not defined anywhere in this document.
        await ingest(
            content=chunk["content"],
            collection=collection,
            metadata=chunk_metadata,
            doc_id=f"{doc_id}_chunk_{chunk['chunk_index']}"
        )
        ingested += 1
    # Ingest tables separately so they stay queryable as markdown on their own
    for table in extracted.get("tables", []):
        table_metadata = {
            **doc_metadata,
            "content_type": "table",
            "page_number": table["page_number"],
            "table_number": table["table_number"]
        }
        await ingest(
            content=table["markdown"],
            collection=collection,
            metadata=table_metadata,
            doc_id=f"{doc_id}_table_{table['page_number']}_{table['table_number']}"
        )
    return {
        "status": "success",
        "filename": path.name,
        "pages": extracted["total_pages"],
        "chunks": ingested,
        "tables": len(extracted.get("tables", [])),
        "collection": collection,
        "doc_id": doc_id
    }
async def harvest_pdf_url(
    url: str,
    collection: str,
    **kwargs
) -> Dict:
    """Download a PDF to a temp file, harvest it, then clean up."""
    import httpx
    import tempfile

    # Fetch the document, following redirects (publishers often redirect).
    async with httpx.AsyncClient() as client:
        response = await client.get(url, follow_redirects=True)
        response.raise_for_status()

    # Persist to a named temp file so harvest_pdf can open it by path.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as handle:
        handle.write(response.content)
        temp_path = handle.name

    try:
        summary = await harvest_pdf(temp_path, collection, **kwargs)
        summary["source_url"] = url
        return summary
    finally:
        # Remove the temp file whether or not harvesting succeeded.
        Path(temp_path).unlink()
Metadata Schema
# PDF chunk metadata
source_type: pdf
source_path: /path/to/document.pdf
source_url: https://... (if downloaded)
filename: document.pdf
total_pages: 45
page_start: 5
page_end: 7
section: "3. Methodology"
chunk_index: 12
total_chunks: 28
harvested_at: "2024-01-01T12:00:00Z"
is_academic: true
title: "Paper Title"
abstract: "Paper abstract..."
content_type: text|table
Usage Examples
# Local PDF
result = await harvest_pdf(
pdf_path="/path/to/document.pdf",
collection="research_papers",
chunk_strategy="sections",
is_academic=True
)
# PDF from URL
result = await harvest_pdf_url(
url="https://arxiv.org/pdf/2301.00001.pdf",
collection="ml_papers",
is_academic=True
)
# Scanned document
result = await harvest_pdf(
pdf_path="/path/to/scanned.pdf",
collection="legacy_docs",
use_ocr=True
)
Refinement Notes
Track improvements as you use this skill.
- Text extraction tested
- Table extraction working
- OCR fallback tested
- Academic paper pattern validated
- Chunking strategies compared
- Large PDF handling optimized