14
总安装量
12
周安装量
#23731
全站排名
安装命令
npx skills add https://github.com/leastbit/claude_skills_zh-cn --skill pdf
Agent 安装分布
claude-code
9
gemini-cli
8
codex
8
cursor
8
opencode
8
antigravity
7
Skill 文档
PDF 处理指南
概述
本指南涵盖使用 Python 库和命令行工具进行的基本 PDF 处理操作。有关高级功能、JavaScript 库和详细示例，请参阅 reference.md。如果需要填写 PDF 表单，请阅读 forms.md 并按照其说明操作。
快速开始
from pypdf import PdfReader, PdfWriter
# 读取 PDF
reader = PdfReader("document.pdf")
print(f"页数: {len(reader.pages)}")
# 提取文本
text = ""
for page in reader.pages:
text += page.extract_text()
Python 库
pypdf – 基本操作
合并 PDF
from pypdf import PdfWriter, PdfReader
writer = PdfWriter()
for pdf_file in ["doc1.pdf", "doc2.pdf", "doc3.pdf"]:
reader = PdfReader(pdf_file)
for page in reader.pages:
writer.add_page(page)
with open("merged.pdf", "wb") as output:
writer.write(output)
拆分 PDF
reader = PdfReader("input.pdf")
for i, page in enumerate(reader.pages):
writer = PdfWriter()
writer.add_page(page)
with open(f"page_{i+1}.pdf", "wb") as output:
writer.write(output)
提取元数据
reader = PdfReader("document.pdf")
meta = reader.metadata
print(f"标题: {meta.title}")
print(f"作者: {meta.author}")
print(f"主题: {meta.subject}")
print(f"创建者: {meta.creator}")
旋转页面
reader = PdfReader("input.pdf")
writer = PdfWriter()
page = reader.pages[0]
page.rotate(90) # 顺时针旋转90度
writer.add_page(page)
with open("rotated.pdf", "wb") as output:
writer.write(output)
pdfplumber – 文本和表格提取
提取带布局的文本
import pdfplumber
with pdfplumber.open("document.pdf") as pdf:
for page in pdf.pages:
text = page.extract_text()
print(text)
提取表格
with pdfplumber.open("document.pdf") as pdf:
for i, page in enumerate(pdf.pages):
tables = page.extract_tables()
for j, table in enumerate(tables):
print(f"第 {i+1} 页的表格 {j+1}:")
for row in table:
print(row)
高级表格提取
import pandas as pd
with pdfplumber.open("document.pdf") as pdf:
all_tables = []
for page in pdf.pages:
tables = page.extract_tables()
for table in tables:
if table: # 检查表格是否为空
df = pd.DataFrame(table[1:], columns=table[0])
all_tables.append(df)
# 合并所有表格
if all_tables:
combined_df = pd.concat(all_tables, ignore_index=True)
combined_df.to_excel("extracted_tables.xlsx", index=False)
reportlab – 创建 PDF
基本 PDF 创建
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
c = canvas.Canvas("hello.pdf", pagesize=letter)
width, height = letter
# 添加文本
c.drawString(100, height - 100, "Hello World!")
c.drawString(100, height - 120, "这是用 reportlab 创建的 PDF")
# 添加线条
c.line(100, height - 140, 400, height - 140)
# 保存
c.save()
创建多页 PDF
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
from reportlab.lib.styles import getSampleStyleSheet
doc = SimpleDocTemplate("report.pdf", pagesize=letter)
styles = getSampleStyleSheet()
story = []
# 添加内容
title = Paragraph("报告标题", styles['Title'])
story.append(title)
story.append(Spacer(1, 12))
body = Paragraph("这是报告的正文内容。" * 20, styles['Normal'])
story.append(body)
story.append(PageBreak())
# 第2页
story.append(Paragraph("第2页", styles['Heading1']))
story.append(Paragraph("第2页的内容", styles['Normal']))
# 构建 PDF
doc.build(story)
命令行工具
pdftotext (poppler-utils)
# 提取文本
pdftotext input.pdf output.txt
# 提取文本并保留布局
pdftotext -layout input.pdf output.txt
# 提取指定页面
pdftotext -f 1 -l 5 input.pdf output.txt # 第1-5页
qpdf
# 合并 PDF
qpdf --empty --pages file1.pdf file2.pdf -- merged.pdf
# 拆分页面
qpdf input.pdf --pages . 1-5 -- pages1-5.pdf
qpdf input.pdf --pages . 6-10 -- pages6-10.pdf
# 旋转页面
qpdf input.pdf output.pdf --rotate=+90:1 # 将第1页旋转90度
# 移除密码
qpdf --password=mypassword --decrypt encrypted.pdf decrypted.pdf
pdftk（如果可用）
# 合并
pdftk file1.pdf file2.pdf cat output merged.pdf
# 拆分
pdftk input.pdf burst
# 旋转
pdftk input.pdf rotate 1east output rotated.pdf
常见任务
从扫描的 PDF 提取文本
# 需要安装: pip install pytesseract pdf2image
import pytesseract
from pdf2image import convert_from_path
# 将 PDF 转换为图像
images = convert_from_path('scanned.pdf')
# 对每一页进行 OCR 识别
text = ""
for i, image in enumerate(images):
text += f"第 {i+1} 页:\n"
text += pytesseract.image_to_string(image)
text += "\n\n"
print(text)
添加水印
from pypdf import PdfReader, PdfWriter
# 创建水印（或加载现有的）
watermark = PdfReader("watermark.pdf").pages[0]
# 应用到所有页面
reader = PdfReader("document.pdf")
writer = PdfWriter()
for page in reader.pages:
page.merge_page(watermark)
writer.add_page(page)
with open("watermarked.pdf", "wb") as output:
writer.write(output)
提取图像
# 使用 pdfimages (poppler-utils)
pdfimages -j input.pdf output_prefix
# 这会将所有图像提取为 output_prefix-000.jpg、output_prefix-001.jpg 等
密码保护
from pypdf import PdfReader, PdfWriter
reader = PdfReader("input.pdf")
writer = PdfWriter()
for page in reader.pages:
writer.add_page(page)
# 添加密码
writer.encrypt("userpassword", "ownerpassword")
with open("encrypted.pdf", "wb") as output:
writer.write(output)
快速参考
| 任务 | 最佳工具 | 命令/代码 |
|---|---|---|
| 合并 PDF | pypdf | writer.add_page(page) |
| 拆分 PDF | pypdf | 每页一个文件 |
| 提取文本 | pdfplumber | page.extract_text() |
| 提取表格 | pdfplumber | page.extract_tables() |
| 创建 PDF | reportlab | Canvas 或 Platypus |
| 命令行合并 | qpdf | qpdf --empty --pages ... |
| OCR 扫描 PDF | pytesseract | 先转换为图像 |
| 填写 PDF 表单 | pdf-lib 或 pypdf（参见 forms.md） | 参见 forms.md |
后续步骤
- 有关 pypdfium2 的高级用法，请参阅 reference.md
- 有关 JavaScript 库（pdf-lib），请参阅 reference.md
- 如果需要填写 PDF 表单，请按照 forms.md 中的说明操作
- 有关故障排除指南，请参阅 reference.md