Install command
npx skills add https://github.com/amnadtaowsoam/cerebraskills --skill OCR with PaddleOCR
Skill Documentation
OCR with PaddleOCR
Overview
PaddleOCR is a powerful, open-source OCR toolkit that supports multi-language text recognition, table recognition, and document layout analysis. This skill covers implementation patterns for various document processing scenarios.
Prerequisites
- Python 3.8+: Required for PaddlePaddle and PaddleOCR
- PaddlePaddle: Deep learning framework (CPU or GPU version)
- OpenCV: For image preprocessing and manipulation
- NumPy: For array operations
- Image Preprocessing: Understanding of image enhancement techniques
- Deep Learning Basics: Knowledge of neural networks and model inference
Key Concepts
- Detection Model: Locates text regions in images using DBNet
- Recognition Model: Identifies text content using CRNN
- Direction Classifier: Determines text orientation (0°, 90°, 180°, 270°); how these three model stages show up in the OCR output is sketched after this list
- Multi-language Support: Supports 80+ languages with specific models
- Table Recognition: Specialized models for extracting structured table data
- Document Layout Analysis: Identifies document structure (headers, paragraphs, tables, images)
- GPU Acceleration: CUDA support for faster inference
- Model Quantization: INT8 quantization for deployment on edge devices
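The detection, recognition, and classification stages described above surface directly in the output of the classic PaddleOCR API used throughout this document. A purely illustrative sketch of one result line follows; the values are made up:
# Illustrative only: the shape of one line in a classic PaddleOCR result
# - the detection model contributed the four corner points of the text box
# - the direction classifier rotated the crop upright before recognition
# - the recognition model produced the text string and its confidence score
sample_line = [
    [[28.0, 37.0], [302.0, 37.0], [302.0, 71.0], [28.0, 71.0]],  # bbox corner points
    ("Invoice Number: 10045", 0.97),                             # (text, confidence)
]
bbox, (text, confidence) = sample_line
print(text, confidence)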
Implementation Guide
Installation
# CPU version
pip install paddlepaddle paddleocr
# GPU version (CUDA 11.2)
pip install paddlepaddle-gpu paddleocr
# GPU version (CUDA 11.8)
pip install paddlepaddle-gpu==2.5.2.post118 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
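After installation, a quick sanity check confirms the environment works; `paddle.utils.run_check()` is PaddlePaddle's built-in self-test, and the version prints are a minimal sketch for verifying the imports:
# Verify that PaddlePaddle is installed and can run a forward pass
import paddle
paddle.utils.run_check()
# Confirm PaddleOCR imports and report versions
import paddleocr
print("PaddlePaddle:", paddle.__version__)
print("PaddleOCR:", getattr(paddleocr, "__version__", "unknown"))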
Basic Text Recognition
from paddleocr import PaddleOCR
import cv2
# Initialize PaddleOCR
ocr = PaddleOCR(use_angle_cls=True, lang='en')
# Read image
image_path = 'document.png'
image = cv2.imread(image_path)
# Perform OCR
result = ocr.ocr(image, cls=True)
# Extract text
for idx in range(len(result)):
    res = result[idx]
    for line in res:
        print(line[1][0])  # Text content
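The `ocr.ocr()` call also accepts a file path directly, which skips the explicit `cv2.imread` step; a minimal variant of the loop above:
# Passing a path instead of an array; PaddleOCR loads the file itself
result = ocr.ocr('document.png', cls=True)
for line in result[0]:              # result holds one entry per input image/page
    text, confidence = line[1]
    print(text, confidence)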
Multi-language OCR
from paddleocr import PaddleOCR
# Supported languages: 'ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka', 'ca', 'hi'
# English
ocr_en = PaddleOCR(use_angle_cls=True, lang='en')
# Chinese
ocr_ch = PaddleOCR(use_angle_cls=True, lang='ch')
# Thai
ocr_th = PaddleOCR(use_angle_cls=True, lang='th')
# Korean
ocr_kr = PaddleOCR(use_angle_cls=True, lang='korean')
# Custom language model
ocr_custom = PaddleOCR(
    use_angle_cls=True,
    lang='en',
    det_model_dir='./custom_det/',
    rec_model_dir='./custom_rec/',
    cls_model_dir='./custom_cls/'
)
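Because each engine downloads and holds its own models, it is worth creating them once and reusing them. A small, hypothetical helper that caches one engine per language code:
from paddleocr import PaddleOCR
# Hypothetical helper: cache one PaddleOCR engine per language code
_engines = {}
def get_ocr(lang='en'):
    """Return a cached PaddleOCR instance for the given language code."""
    if lang not in _engines:
        _engines[lang] = PaddleOCR(use_angle_cls=True, lang=lang)
    return _engines[lang]
# Reuses the same English engine on repeated calls
result = get_ocr('en').ocr('document.png', cls=True)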
Table Recognition
from paddleocr import PPStructure
import cv2
# Table recognition is handled by the PP-Structure pipeline rather than the plain PaddleOCR class
table_engine = PPStructure(lang='en', show_log=True)
# Read image containing a table
image = cv2.imread('table.png')
# Run structure analysis (detects table regions and reconstructs their structure)
result = table_engine(image)
# Extract table data as HTML
for region in result:
    if region['type'] == 'table':
        html = region['res']['html']
        print(f"Table HTML: {html}")
Document Layout Analysis
from paddleocr import PPStructure
import cv2
# Initialize structure analysis
table_engine = PPStructure(show_log=True)
# Analyze document layout
image = cv2.imread('document.png')
result = table_engine(image)
# Process layout results
for region in result:
    print(f"Type: {region['type']}")
    print(f"Confidence: {region['score']:.2f}")
    if region['type'] == 'table':
        # Extract table HTML
        html = region['res']['html']
        print(f"Table HTML: {html}")
    elif region['type'] == 'text':
        # Extract recognized text lines
        for text_line in region['res']:
            print(f"Text: {text_line['text']}")
Batch Processing
from paddleocr import PaddleOCR
import cv2
import glob
import json
import os
# Initialize OCR
ocr = PaddleOCR(use_angle_cls=True, lang='en')
# Process multiple images
image_dir = 'documents/'
image_files = glob.glob(os.path.join(image_dir, '*.png'))
results = []
for image_file in image_files:
    image = cv2.imread(image_file)
    result = ocr.ocr(image, cls=True)
    results.append({
        'file': image_file,
        'result': result
    })
# Save results
with open('ocr_results.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)
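Raw OCR results can contain NumPy scalar types that `json.dump` refuses to serialize; if that happens, flattening each result to plain Python types before saving avoids the issue. A sketch, reusing `results` from the loop above:
def to_serializable(result):
    """Flatten one ocr() result into JSON-friendly dicts of plain Python types."""
    lines = []
    for page in result:
        for bbox, (text, confidence) in page:
            lines.append({
                'text': str(text),
                'confidence': float(confidence),
                'bbox': [[float(x), float(y)] for x, y in bbox],
            })
    return lines
serializable = [{'file': r['file'], 'lines': to_serializable(r['result'])} for r in results]
with open('ocr_results_flat.json', 'w', encoding='utf-8') as f:
    json.dump(serializable, f, ensure_ascii=False, indent=2)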
GPU Acceleration
from paddleocr import PaddleOCR
# Initialize with GPU support
ocr = PaddleOCR(
    use_angle_cls=True,
    lang='en',
    use_gpu=True,         # Enable GPU inference
    gpu_mem=500,          # GPU memory budget in MB
    enable_mkldnn=False   # MKL-DNN only accelerates CPU inference; leave it off when using the GPU
)
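A simple timing comparison confirms whether the GPU is actually paying off on your documents; a sketch using `time.perf_counter`, assuming a GPU build of PaddlePaddle is installed:
import time
import cv2
from paddleocr import PaddleOCR
image = cv2.imread('document.png')
def time_ocr(ocr, image, runs=5):
    """Average OCR latency over several runs; the warm-up call absorbs model loading."""
    ocr.ocr(image, cls=True)  # warm-up
    start = time.perf_counter()
    for _ in range(runs):
        ocr.ocr(image, cls=True)
    return (time.perf_counter() - start) / runs
cpu_ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=False)
gpu_ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True)
print(f"CPU: {time_ocr(cpu_ocr, image):.2f}s  GPU: {time_ocr(gpu_ocr, image):.2f}s")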
Custom Model Training
# Prepare training data in the PaddleOCR label format (recognition labels: image_path<TAB>text_content)
# Run the following commands from a clone of the PaddleOCR repository
# Train a custom detection model
python tools/train.py -c configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det.yml -o Global.pretrained_model=./your_model/best_accuracy
# Train a custom recognition model
python tools/train.py -c configs/rec/ch_PP-OCRv4/ch_PP-OCRv4_rec.yml -o Global.pretrained_model=./your_model/best_accuracy
# Export the trained model for inference
python tools/export_model.py -c configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det.yml -o Global.pretrained_model=./your_model/best_accuracy Global.save_inference_dir=./inference/det
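Once exported, the inference models plug back into the Python API through the `*_model_dir` arguments shown earlier; a sketch, assuming the detection model was exported by the command above and the recognition model was exported the same way to ./inference/rec:
from paddleocr import PaddleOCR
# Point PaddleOCR at the exported inference models instead of the defaults
ocr = PaddleOCR(
    use_angle_cls=True,
    lang='en',
    det_model_dir='./inference/det',  # from export_model.py above
    rec_model_dir='./inference/rec',  # assumed: recognition model exported the same way
)
result = ocr.ocr('document.png', cls=True)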
Result Processing
from paddleocr import PaddleOCR
import cv2
ocr = PaddleOCR(use_angle_cls=True, lang='en')
image = cv2.imread('document.png')
result = ocr.ocr(image, cls=True)
# Extract structured results
def extract_text_results(result):
    """Extract and structure OCR results"""
    extracted = []
    for idx in range(len(result)):
        res = result[idx]
        for line in res:
            bbox, (text, confidence) = line
            # Calculate the axis-aligned bounding box from the four corner points
            x1 = min([point[0] for point in bbox])
            y1 = min([point[1] for point in bbox])
            x2 = max([point[0] for point in bbox])
            y2 = max([point[1] for point in bbox])
            extracted.append({
                'text': text,
                'confidence': confidence,
                'bbox': {
                    'x1': x1,
                    'y1': y1,
                    'x2': x2,
                    'y2': y2
                },
                'points': bbox
            })
    return extracted
# Get structured results
structured_results = extract_text_results(result)
# Sort by Y position (top to bottom)
sorted_results = sorted(structured_results, key=lambda x: x['bbox']['y1'])
# Print results
for item in sorted_results:
    print(f"{item['text']} (confidence: {item['confidence']:.2f})")
Visualization
import cv2
import numpy as np
from paddleocr import PaddleOCR
ocr = PaddleOCR(use_angle_cls=True, lang='en')
image = cv2.imread('document.png')
result = ocr.ocr(image, cls=True)
# Draw bounding boxes
def draw_ocr_results(image, result):
    """Draw OCR results on image"""
    image_copy = image.copy()
    for idx in range(len(result)):
        res = result[idx]
        for line in res:
            bbox, (text, confidence) = line
            # Convert to a numpy array of integer points
            points = np.array(bbox, dtype=np.int32)
            # Draw bounding box: green for high confidence, orange otherwise
            color = (0, 255, 0) if confidence > 0.9 else (0, 165, 255)
            cv2.polylines(image_copy, [points], True, color, 2)
            # Draw text above the box
            x, y = bbox[0]
            cv2.putText(
                image_copy,
                f"{text} ({confidence:.2f})",
                (int(x), int(y - 10)),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.5,
                color,
                1
            )
    return image_copy
# Visualize
result_image = draw_ocr_results(image, result)
cv2.imwrite('ocr_result.png', result_image)
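PaddleOCR also ships a PIL-based `draw_ocr` helper that renders the boxes and recognized text side by side; it needs a TTF font file, so the font path below is an assumption to adjust for your system:
from paddleocr import draw_ocr
from PIL import Image
boxes = [line[0] for line in result[0]]
texts = [line[1][0] for line in result[0]]
scores = [line[1][1] for line in result[0]]
# Convert the OpenCV BGR array to RGB before handing it to the PIL-based helper
pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
annotated = draw_ocr(pil_image, boxes, texts, scores, font_path='./fonts/simfang.ttf')  # font path is an assumption
Image.fromarray(annotated).save('ocr_result_draw_ocr.png')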
Best Practices
Performance Optimization
# Choose the model family that matches your latency/accuracy budget:
# PP-OCRv4 mobile models: lightweight and fast, slightly lower accuracy (the default)
# PP-OCRv4 server models: highest accuracy, heavier and slower
ocr = PaddleOCR(
    use_angle_cls=True,
    lang='en',
    det_algorithm='DB',    # Detection algorithm
    rec_algorithm='CRNN',  # Recognition algorithm
    use_tensorrt=True,     # Enable TensorRT for faster GPU inference (requires a TensorRT build of Paddle)
    precision='fp16'       # Use FP16 for faster inference
)
Image Preprocessing
import cv2
import numpy as np
from paddleocr import PaddleOCR
def preprocess_image(image_path):
    """Preprocess image for better OCR results"""
    image = cv2.imread(image_path)
    # Convert to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Apply denoising
    denoised = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
    # Apply adaptive thresholding
    binary = cv2.adaptiveThreshold(
        denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 11, 2
    )
    return binary
# Use preprocessed image
ocr = PaddleOCR(use_angle_cls=True, lang='en')
processed_image = preprocess_image('document.png')
result = ocr.ocr(processed_image, cls=True)
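Low-resolution input is another common failure mode; upscaling small images before recognition often helps. A small sketch (the size threshold is an assumption to tune per document type), reusing the `ocr` engine created above:
import cv2
def upscale_if_small(image, min_side=720):
    """Upscale images whose shorter side is below min_side before OCR."""
    h, w = image.shape[:2]
    short_side = min(h, w)
    if short_side >= min_side:
        return image
    scale = min_side / short_side
    return cv2.resize(image, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
small_image = upscale_if_small(cv2.imread('small_receipt.png'))
result = ocr.ocr(small_image, cls=True)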
Error Handling
from paddleocr import PaddleOCR
import cv2
def safe_ocr(image_path, ocr):
    """Safe OCR with error handling"""
    try:
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError(f"Failed to load image: {image_path}")
        result = ocr.ocr(image, cls=True)
        return result
    except Exception as e:
        print(f"OCR error: {e}")
        return None
# Initialize OCR
ocr = PaddleOCR(use_angle_cls=True, lang='en')
# Process with error handling
result = safe_ocr('document.png', ocr)
if result:
    # Process results
    pass
Confidence Thresholding
def filter_by_confidence(result, threshold=0.8):
    """Filter OCR results by confidence threshold"""
    filtered = []
    for idx in range(len(result)):
        res = result[idx]
        for line in res:
            bbox, (text, confidence) = line
            if confidence >= threshold:
                filtered.append({
                    'text': text,
                    'confidence': confidence,
                    'bbox': bbox
                })
    return filtered
# Filter low-confidence results
high_confidence_results = filter_by_confidence(result, threshold=0.8)
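Beyond dropping individual low-confidence lines, an aggregate score per document is useful for routing pages to manual review; a hypothetical helper (the review threshold is an assumption):
def needs_review(items, min_mean_confidence=0.9):
    """Flag a document for manual review when its mean line confidence is low."""
    confidences = [item['confidence'] for item in items]
    if not confidences:
        return True  # nothing recognized at all
    return sum(confidences) / len(confidences) < min_mean_confidence
print("Send to manual review:", needs_review(high_confidence_results))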
Related Skills
- Image Preprocessing – Image enhancement for better OCR accuracy
- Document Parsing – Structured data extraction from documents
- OCR with Tesseract – Alternative OCR engine
- PDF Processing – PDF-specific processing techniques
- Document Ingestion Pipeline – Document loading workflows