完善ocr识别内容,增加python脚本文件

master
tyc 6 months ago
parent 4de8e64451
commit 64cfbd851e
  1. 217
      script/python/paddleocr_service.py
  2. 2
      yudao-server/src/main/java/cn/iocoder/yudao/server/service/OcrProcessingService.java
  3. 2
      yudao-server/src/main/java/cn/iocoder/yudao/server/service/SupervisionService.java
  4. 44
      yudao-server/src/main/java/cn/iocoder/yudao/server/service/UnifiedSocialCreditService.java
  5. 6
      yudao-server/src/main/resources/application-local.yaml

@ -0,0 +1,217 @@
import os
import tempfile
import logging
import time
from fastapi import FastAPI, HTTPException, status, Body
from pydantic import BaseModel
import fitz # PyMuPDF
from paddleocr import PaddleOCR
from typing import Optional, Dict, Any
# Logging setup: every record is written both to a log file and to the console.
_log_file_handler = logging.FileHandler('/var/log/ocr_service.log', encoding='utf-8')
_console_handler = logging.StreamHandler()
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[_log_file_handler, _console_handler],
)
# The service logger is set to DEBUG, i.e. more verbose than the INFO root level above.
logger = logging.getLogger('OCRService')
logger.setLevel(logging.DEBUG)
# Input constraints and storage root consulted by the path/file validators below.
ALLOWED_EXTENSIONS = {'png', 'jpeg', 'jpg', 'pdf'}  # compared lower-case, without the dot
MAX_FILE_SIZE = 100 * 1024 ** 2  # 100 MB, expressed in bytes
BASE_STORAGE = '/data/files'  # requested paths are resolved relative to this directory
# Shared PaddleOCR engine: built once at import time and reused by every request.
_OCR_ENGINE_OPTIONS = {
    'use_angle_cls': True,
    'lang': 'ch',
    'enable_mkldnn': True,
    'det_limit_side_len': 2200,
    'det_db_box_thresh': 0.4,
}
ocr_engine = PaddleOCR(**_OCR_ENGINE_OPTIONS)

# Application object that the route and exception-handler decorators attach to.
app = FastAPI()
class OCRRequest(BaseModel):
    """Request body accepted by the ``/ocr`` endpoint."""

    # Path of the file to recognise; the endpoint resolves it against BASE_STORAGE
    # via safe_resolve_path before touching the filesystem.
    file_path: str
class OCRResponse(BaseModel):
    """Response envelope declared as the ``response_model`` of the ``/ocr`` endpoint."""

    # "success" on the normal path; the OCRServiceError handler emits "error".
    status: str
    # Recognition payload produced by process_pdf / process_image.
    data: Optional[Dict[str, Any]] = None
    # Error description; stays None on success.
    message: Optional[str] = None
    # Status code echoed by the OCRServiceError handler; stays None on success.
    code: Optional[int] = None
    # Processing time formatted as seconds with two decimals, e.g. "1.23s".
    elapsed: str
class OCRServiceError(Exception):
    """Service-level failure carrying a client-facing message and an HTTP status.

    Attributes:
        message: text returned to the client by the exception handler.
        status_code: HTTP status to respond with (defaults to 400).
    """

    def __init__(self, message: str, status_code: int = 400):
        # Initialise the base class with the message only, so that str(exc) and
        # exc.args hold the message instead of a "(message, status_code)" tuple.
        super().__init__(message)
        self.message = message
        self.status_code = status_code
# Path resolution and file validation helpers
def safe_resolve_path(user_path: str) -> str:
    """Resolve a caller-supplied path to an absolute path inside BASE_STORAGE.

    Leading slashes are stripped so the path is always interpreted relative to
    BASE_STORAGE, and the normalised result must stay inside that directory.

    Args:
        user_path: path taken from the request body (untrusted).

    Returns:
        The absolute path below BASE_STORAGE.

    Raises:
        OCRServiceError: if the path is empty, escapes BASE_STORAGE, or cannot
            be processed at all.
    """
    try:
        clean_path = os.path.normpath(user_path).lstrip('/')
        if not clean_path:
            raise ValueError("空路径参数")
        base_dir = os.path.abspath(BASE_STORAGE)
        abs_path = os.path.abspath(os.path.join(base_dir, clean_path))
        # A plain startswith() prefix test would also accept sibling directories
        # such as "<BASE_STORAGE>_other" reached through "../"; compare whole
        # path components instead.
        # NOTE(review): symlinks are not resolved here — confirm whether links
        # inside the storage root can point outside of it.
        if os.path.commonpath([base_dir, abs_path]) != base_dir:
            raise ValueError("非法路径访问")
        return abs_path
    except Exception as e:
        logger.error(f"路径解析失败: {str(e)}")
        raise OCRServiceError("无效文件路径") from e
def validate_file(file_path: str) -> str:
    """Check that a resolved path points to an acceptable file.

    The checks run in this order: the path exists, it is a regular file, its
    size does not exceed MAX_FILE_SIZE, and its extension is allowed.

    Args:
        file_path: absolute path of the candidate file.

    Returns:
        The lower-cased extension without the leading dot.

    Raises:
        OCRServiceError: when a check fails, or when an unexpected error occurs
            while checking (reported as a generic validation failure).
    """
    # (predicate, message raised when the predicate is false), evaluated in order
    presence_checks = (
        (os.path.exists, "文件不存在"),
        (os.path.isfile, "路径不是文件"),
    )
    try:
        for predicate, failure_message in presence_checks:
            if not predicate(file_path):
                raise OCRServiceError(failure_message)

        size_in_bytes = os.path.getsize(file_path)
        if size_in_bytes > MAX_FILE_SIZE:
            raise OCRServiceError(f"文件超过大小限制 ({MAX_FILE_SIZE//1024//1024}MB)")

        ext = os.path.splitext(file_path)[1].lstrip('.').lower()
        if ext in ALLOWED_EXTENSIONS:
            return ext
        raise OCRServiceError(f"不支持的文件类型: {ext}")
    except OCRServiceError:
        # Validation failures already carry their own message.
        raise
    except Exception as e:
        logger.error(f"文件验证异常: {str(e)}")
        raise OCRServiceError("文件验证失败") from e
# OCR result formatting and per-file processing helpers
def format_ocr_result(raw_data):
    """Convert one raw OCR result into a list of plain, JSON-friendly dicts.

    Args:
        raw_data: value returned by ``ocr_engine.ocr``. Its first element is
            read as the list of detections, each unpacked as
            ``(boxes, (text, score))``.

    Returns:
        A list of ``{"text", "score", "boxes"}`` dicts. The list is empty when
        nothing was detected or when the structure cannot be parsed.
    """
    try:
        # An empty result or an empty/None first entry is a normal outcome
        # (presumably "no text found on the image"), not a parse failure.
        if not raw_data or not raw_data[0]:
            return []
        results = []
        for boxes, (text, score) in raw_data[0]:
            # Recognised text is routine output, so log it at DEBUG, not ERROR.
            logger.debug("识别内容: %s", text)
            results.append({
                "text": text,
                "score": float(score),
                "boxes": [list(map(float, point)) for point in boxes],
            })
        return results
    except Exception as e:
        # Best effort: an unexpected structure yields an empty result rather
        # than failing the whole request.
        logger.error("解析失败:%s", str(e), exc_info=True)
        return []
def process_pdf(file_path: str):
    """Render every page of a PDF to an image and run OCR on it.

    Args:
        file_path: absolute path of the PDF file.

    Returns:
        ``{"type": "pdf", "page_count": int, "pages": [...]}`` where each page
        entry holds the 1-based page number, the formatted OCR content and the
        time spent on that page.

    Raises:
        OCRServiceError: if the PDF is encrypted and cannot be opened with an
            empty password, or if any other error occurs during processing.
    """
    try:
        logger.info(f"开始处理PDF: {file_path}")
        doc = fitz.open(file_path)
        try:
            if doc.is_encrypted and not doc.authenticate(""):
                raise OCRServiceError("加密PDF需要密码")

            page_count = len(doc)
            # Loop-invariant render settings, built once.
            # NOTE(review): PyMuPDF documents that `matrix` is ignored when
            # `dpi` is given — confirm and drop one of the two.
            zoom = 900 / 72
            render_matrix = fitz.Matrix(zoom, zoom)

            pages = []
            for page_index in range(page_count):
                page_start = time.time()
                pix = doc.load_page(page_index).get_pixmap(
                    matrix=render_matrix,
                    colorspace=fitz.csGRAY,
                    alpha=False,
                    dpi=900,
                )
                # The OCR engine is given a file path, so the rendered page is
                # written to a temporary PNG that is removed when the block exits.
                with tempfile.NamedTemporaryFile(suffix=".png") as tmp:
                    pix.save(tmp.name)
                    page_result = ocr_engine.ocr(tmp.name, cls=True)
                pages.append({
                    "page": page_index + 1,
                    "content": format_ocr_result(page_result),
                    "process_time": f"{time.time() - page_start:.2f}s",
                })
            return {
                "type": "pdf",
                "page_count": page_count,
                "pages": pages,
            }
        finally:
            # Always release the document handle, on success and on failure.
            doc.close()
    except OCRServiceError:
        raise
    except Exception as e:
        logger.error(f"PDF处理异常: {str(e)}", exc_info=True)
        raise OCRServiceError("PDF处理失败") from e
def process_image(file_path: str):
    """Run OCR on a single image file.

    Args:
        file_path: absolute path of the image file.

    Returns:
        ``{"type": "image", "results": [...]}`` with the formatted detections.

    Raises:
        Exception: any error raised by the OCR engine is logged and re-raised
            unchanged.
    """
    try:
        logger.info(f"开始处理图像: {file_path}")
        result = ocr_engine.ocr(file_path, cls=True)
        logger.debug("原始OCR数据结构类型: %s", type(result))
        return {
            "type": "image",
            "results": format_ocr_result(result),
        }
    except Exception:
        logger.error("图像处理异常")
        raise
@app.exception_handler(OCRServiceError)
async def ocr_exception_handler(request, exc: OCRServiceError):
    """Turn an OCRServiceError into a JSON error response.

    Args:
        request: the incoming request (unused).
        exc: the error raised while serving the request.

    Returns:
        A JSONResponse whose HTTP status is ``exc.status_code`` and whose body
        carries the error message and code.
    """
    # Imported locally because the module's import block does not bring
    # JSONResponse into scope; without it this handler fails with a NameError.
    from fastapi.responses import JSONResponse

    return JSONResponse(
        status_code=exc.status_code,
        content={
            "status": "error",
            "message": exc.message,
            "code": exc.status_code,
            # The request start time is not available here, so a fixed
            # placeholder is reported.
            "elapsed": "0.00s",
        },
    )
@app.post("/ocr", response_model=OCRResponse)
async def ocr_service(request_data: OCRRequest = Body(...)):
    """Run OCR on the file named in the request and return the result.

    The path is resolved and validated first. PDFs go through process_pdf and
    every other allowed type through process_image.

    Args:
        request_data: request body holding the path of the file to process.

    Returns:
        A dict matching OCRResponse, with ``status`` "success", the OCR payload
        in ``data`` and the elapsed time.

    Raises:
        OCRServiceError: validation or processing failures; these are left to
            the registered OCRServiceError exception handler.
        HTTPException: status 500 for any other unexpected error.
    """
    # NOTE(review): the OCR calls below are synchronous inside an async
    # handler — confirm whether blocking the event loop is acceptable.
    start_time = time.time()
    response = {"status": "success", "data": None, "message": None, "code": None}
    try:
        abs_path = safe_resolve_path(request_data.file_path)
        logger.debug(f"处理请求文件: {abs_path}")
        ext = validate_file(abs_path)
        handler = process_pdf if ext == 'pdf' else process_image
        response["data"] = handler(abs_path)
    except OCRServiceError:
        # Re-raised untouched so it is not caught by the generic branch below.
        raise
    except Exception as e:
        logger.error(f"系统异常: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="系统内部错误"
        ) from e
    finally:
        response["elapsed"] = f"{time.time() - start_time:.2f}s"
    # Returned after (not inside) the finally block, so a raised error is never
    # replaced by a "success" payload.
    return response
@app.get("/healthcheck", response_model=Dict[str, Any])
async def health_check():
    """Report a fixed status, the current epoch timestamp and the service name."""
    payload = dict(status="ok", timestamp=time.time(), service="OCR")
    return payload
if __name__ == "__main__":
    # Script entry point: make sure the storage root exists, then serve the
    # app on all interfaces, port 5000.
    from uvicorn import run as serve

    os.makedirs(BASE_STORAGE, exist_ok=True)
    server_options = {"host": "0.0.0.0", "port": 5000}
    serve(app, **server_options)

@ -17,7 +17,7 @@ import java.util.stream.Collectors;
@Service
public class OcrProcessingService {
private static final String OCR_SERVICE_URL = "http://127.0.0.1:5000/ocr";
private static final String OCR_SERVICE_URL = "http://192.168.130.192:5000/ocr";
@Autowired
private IdCardInfoService idCardInfoService;
@Autowired

@ -40,7 +40,7 @@ import sun.misc.BASE64Encoder;
@Service
public class SupervisionService {
private static final Logger LOGGER = LoggerFactory.getLogger(SupervisionService.class);
private static final String UNIFIED_REQUEST_URL = "http://127.0.0.1:8080/table-recognition";
private static final String UNIFIED_REQUEST_URL = "http://192.168.130.192:8080/table-recognition";
private static final Gson GSON = new Gson();
private final ReentrantLock lock = new ReentrantLock();

@ -1,5 +1,6 @@
package cn.iocoder.yudao.server.service;
import cn.hutool.core.collection.CollUtil;
import org.springframework.stereotype.Service;
import java.util.*;
@ -46,13 +47,52 @@ public class UnifiedSocialCreditService {
/**
 * Extracts business-licence fields from recognised text lines.
 *
 * @param texts recognised text lines
 * @return map filled by the company, code and address extractors
 */
public Map<String, Object> extractBusinessLicenseInfo(List<String> texts) {
    Map<String, Object> data = new HashMap<>();
    // Remove the "经营范围" heading and its content line before merging;
    // presumably so that scope text is not picked up by the extractors below — confirm.
    List<String> resultList = filterBusinessScope(texts);
    // mergedText is declared exactly once, from the filtered list.
    String mergedText = preprocess(resultList);
    extractCompanies(mergedText, data);
    extractCode(mergedText, data);
    extractAddress(mergedText, data);
    return data;
}
/**
 * Returns a copy of the list without "经营范围" entries and, for each such
 * entry, without the entry that follows it, unless that following entry
 * contains one of "号", "室", "路", "村" (in which case it is kept).
 *
 * <p>The input list is never modified. The result is built in a single
 * forward pass, so a "经营范围" entry at index 0 cannot drive the index negative.
 *
 * @param originalList original list of strings; may be null or empty
 * @return a new mutable list with the filtered content (empty for null/empty input)
 */
public static List<String> filterBusinessScope(List<String> originalList) {
    List<String> kept = new ArrayList<>();
    if (originalList == null || originalList.isEmpty()) {
        return kept;
    }
    final int size = originalList.size();
    int i = 0;
    while (i < size) {
        String item = originalList.get(i++);
        if (!"经营范围".equals(item)) {
            kept.add(item);
            continue;
        }
        // The heading itself is dropped. Drop the next entry as well unless it
        // contains one of the keywords; a kept entry is handled by the next
        // loop iteration like any other entry.
        if (i < size && !containsAnyKeyword(originalList.get(i), "号", "室", "路", "村")) {
            i++;
        }
    }
    return kept;
}

/**
 * Tells whether the string contains at least one of the given keywords.
 *
 * @param str      string to inspect
 * @param keywords keywords to look for
 * @return true if any keyword occurs in {@code str}, false otherwise
 */
private static boolean containsAnyKeyword(String str, String... keywords) {
    for (String keyword : keywords) {
        if (str.contains(keyword)) {
            return true;
        }
    }
    return false;
}
private static void extractCompanies(String text, Map<String, Object> data) {
Matcher m = COMPANY_PATTERN.matcher(text);
while (m.find()) {

@ -48,7 +48,7 @@ spring:
primary: master
datasource:
master:
url: jdbc:mysql://127.0.0.1:3306/lq_ocr?useSSL=false&serverTimezone=Asia/Shanghai&allowPublicKeyRetrieval=true&nullCatalogMeansCurrent=true&rewriteBatchedStatements=true # MySQL Connector/J 8.X 连接的示例
url: jdbc:mysql://192.168.130.153:3306/lq_ocr?useSSL=false&serverTimezone=Asia/Shanghai&allowPublicKeyRetrieval=true&nullCatalogMeansCurrent=true&rewriteBatchedStatements=true # MySQL Connector/J 8.X 连接的示例
# url: jdbc:mysql://127.0.0.1:3306/ruoyi-vue-pro?useSSL=true&allowPublicKeyRetrieval=true&useUnicode=true&characterEncoding=UTF-8&serverTimezone=Asia/Shanghai&rewriteBatchedStatements=true # MySQL Connector/J 5.X 连接的示例
# url: jdbc:postgresql://127.0.0.1:5432/ruoyi-vue-pro # PostgreSQL 连接的示例
# url: jdbc:oracle:thin:@127.0.0.1:1521:xe # Oracle 连接的示例
@ -66,7 +66,7 @@ spring:
# password: Yudao@2024 # OpenGauss 连接的示例
slave: # 模拟从库,可根据自己需要修改
lazy: true # 开启懒加载,保证启动速度
url: jdbc:mysql://127.0.0.1:3306/ruoyi-vue-pro?useSSL=false&serverTimezone=Asia/Shanghai&allowPublicKeyRetrieval=true&rewriteBatchedStatements=true&nullCatalogMeansCurrent=true
url: jdbc:mysql://192.168.130.153:3306/ruoyi-vue-pro?useSSL=false&serverTimezone=Asia/Shanghai&allowPublicKeyRetrieval=true&rewriteBatchedStatements=true&nullCatalogMeansCurrent=true
username: root
password: 123456
# tdengine: # IoT 数据库(需要 IoT 物联网再开启噢!)
@ -79,7 +79,7 @@ spring:
# Redis 配置。Redisson 默认的配置足够使用,一般不需要进行调优
redis:
host: 127.0.0.1 # 地址
host: 192.168.130.153 # 地址
port: 6379 # 端口
database: 0 # 数据库索引
password: root # 密码,建议生产环境开启

Loading…
Cancel
Save