完善ocr识别内容，增加python脚本文件

6 months ago · 64cfbd851e
parent 4de8e64451
commit 64cfbd851e
5 changed files with 264 additions and 7 deletions
--- a/script/python/paddleocr_service.py
+++ b/script/python/paddleocr_service.py
@ -0,0 +1,217 @@
 import os
 import tempfile
 import logging
 import time
 from fastapi import FastAPI, HTTPException, status, Body
 from pydantic import BaseModel
 import fitz  # PyMuPDF
 from paddleocr import PaddleOCR
 from typing import Optional, Dict, Any
 # 日志配置保持不变
 logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.FileHandler('/var/log/ocr_service.log', encoding='utf-8'),
        logging.StreamHandler()
    ]
 )
 logger = logging.getLogger('OCRService')
 logger.setLevel(logging.DEBUG)
 # 安全配置保持不变
 ALLOWED_EXTENSIONS = {'pdf', 'jpg', 'jpeg', 'png'}
 MAX_FILE_SIZE = 100 * 1024 * 1024  # 100MB
 BASE_STORAGE = '/data/files'
 # OCR引擎配置保持不变
 ocr_engine = PaddleOCR(
    use_angle_cls=True,
    lang='ch',
    enable_mkldnn=True,
    det_limit_side_len=2200,
    det_db_box_thresh=0.4
 )
 app = FastAPI()
 class OCRRequest(BaseModel):
    file_path: str
 class OCRResponse(BaseModel):
    status: str
    data: Optional[Dict[str, Any]] = None
    message: Optional[str] = None
    code: Optional[int] = None
    elapsed: str
 class OCRServiceError(Exception):
    def __init__(self, message: str, status_code: int = 400):
        self.message = message
        self.status_code = status_code
 # 路径解析和验证函数保持不变
 def safe_resolve_path(user_path: str) -> str:
    try:
        clean_path = os.path.normpath(user_path).lstrip('/')
        if not clean_path:
            raise ValueError("空路径参数")
        abs_path = os.path.abspath(os.path.join(BASE_STORAGE, clean_path))
        if not abs_path.startswith(BASE_STORAGE):
            raise ValueError("非法路径访问")
        return abs_path
    except Exception as e:
        logger.error(f"路径解析失败: {str(e)}")
        raise OCRServiceError("无效文件路径") from e
 def validate_file(file_path: str) -> str:
    try:
        if not os.path.exists(file_path):
            raise OCRServiceError("文件不存在")
        if not os.path.isfile(file_path):
            raise OCRServiceError("路径不是文件")
        if os.path.getsize(file_path) > MAX_FILE_SIZE:
            raise OCRServiceError(f"文件超过大小限制 ({MAX_FILE_SIZE//1024//1024}MB)")
        _, ext_with_dot = os.path.splitext(file_path)
        ext = ext_with_dot.lstrip('.').lower()
        if ext not in ALLOWED_EXTENSIONS:
            raise OCRServiceError(f"不支持的文件类型: {ext}")
        return ext
    except OCRServiceError:
        raise
    except Exception as e:
        logger.error(f"文件验证异常: {str(e)}")
        raise OCRServiceError("文件验证失败") from e
 # 结果格式化和处理函数保持不变
 def format_ocr_result(raw_data):
    results = []
    try:
        for group in raw_data[0]:
            boxes, (text, score) = group
            logger.error(f"识别内容: {str(text)}")
            results.append({
                "text": text,
                "score": float(score),
                "boxes": [list(map(float, point)) for point in boxes]
            })
        return results
    except Exception as e:
        logger.error("解析失败：%s", str(e), exc_info=True)
        return []
 def process_pdf(file_path: str):
    try:
        logger.info(f"开始处理PDF: {file_path}")
        doc = fitz.open(file_path)
        if doc.is_encrypted:
            if not doc.authenticate(""):
                raise OCRServiceError("加密PDF需要密码")
        pages = []
        for page_num in range(len(doc)):
            page_start = time.time()
            page = doc.load_page(page_num)
            zoom = 900 / 72
            mat = fitz.Matrix(zoom, zoom)
            pix = page.get_pixmap(
                matrix=mat,
                colorspace=fitz.csGRAY,
                alpha=False,
                dpi=900
            )
            with tempfile.NamedTemporaryFile(suffix=".png") as tmp:
                pix.save(tmp.name)
                page_result = ocr_engine.ocr(tmp.name, cls=True)
                pages.append({
                    "page": page_num + 1,
                    "content": format_ocr_result(page_result),
                    "process_time": f"{time.time() - page_start:.2f}s"
                })
        return {
            "type": "pdf",
            "page_count": len(doc),
            "pages": pages
        }
    except OCRServiceError:
        raise
    except Exception as e:
        logger.error(f"PDF处理异常: {str(e)}")
        raise OCRServiceError("PDF处理失败") from e
 def process_image(file_path: str):
    try:
        logger.info(f"开始处理图像: {file_path}")
        start_time = time.time()    
        result = ocr_engine.ocr(file_path, cls=True)       
        logger.debug("原始OCR数据结构类型: %s", type(result))
        formatted = format_ocr_result(result)   
        return {
            "type": "image",
            "results": formatted
        }
    except Exception as e:
        logger.error("图像处理异常")
        raise
@app.exception_handler(OCRServiceError)
 async def ocr_exception_handler(request, exc: OCRServiceError):
    return JSONResponse(
        status_code=exc.status_code,
        content={
            "status": "error",
            "message": exc.message,
            "code": exc.status_code,
            "elapsed": "0.00s"
        }
    )
@app.post("/ocr", response_model=OCRResponse)
 async def ocr_service(request_data: OCRRequest = Body(...)):
    start_time = time.time()
    response = {"status": "success", "data": None, "message": None, "code": None}
    try:
        abs_path = safe_resolve_path(request_data.file_path)
        logger.debug(f"处理请求文件: {abs_path}")
        ext = validate_file(abs_path)
        if ext == 'pdf':
            result = process_pdf(abs_path)
        else:
            result = process_image(abs_path)
        response["data"] = result
    except OCRServiceError as e:
        raise e
    except Exception as e:
        logger.error(f"系统异常: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="系统内部错误"
        )
    finally:
        response["elapsed"] = f"{time.time() - start_time:.2f}s"
    return response
@app.get("/healthcheck", response_model=Dict[str, Any])
 async def health_check():
    return {
        "status": "ok",
        "timestamp": time.time(),
        "service": "OCR"
    }
 if __name__ == "__main__":
    import uvicorn
    os.makedirs(BASE_STORAGE, exist_ok=True)
    uvicorn.run(app, host="0.0.0.0", port=5000)
--- a/yudao-server/src/main/java/cn/iocoder/yudao/server/service/OcrProcessingService.java
+++ b/yudao-server/src/main/java/cn/iocoder/yudao/server/service/OcrProcessingService.java
@ -17,7 +17,7 @@ import java.util.stream.Collectors;
@Service
 public class OcrProcessingService {
-    private static final String OCR_SERVICE_URL = "http://127.0.0.1:5000/ocr";
+    private static final String OCR_SERVICE_URL = "http://192.168.130.192:5000/ocr";
    @Autowired
    private IdCardInfoService idCardInfoService;
    @Autowired
--- a/yudao-server/src/main/java/cn/iocoder/yudao/server/service/SupervisionService.java
+++ b/yudao-server/src/main/java/cn/iocoder/yudao/server/service/SupervisionService.java
@ -40,7 +40,7 @@ import sun.misc.BASE64Encoder;
@Service
 public class SupervisionService {
    private static final Logger LOGGER = LoggerFactory.getLogger(SupervisionService.class);
-    private static final String UNIFIED_REQUEST_URL = "http://127.0.0.1:8080/table-recognition";
+    private static final String UNIFIED_REQUEST_URL = "http://192.168.130.192:8080/table-recognition";
    private static final Gson GSON = new Gson();
    private final ReentrantLock lock = new ReentrantLock();
--- a/yudao-server/src/main/java/cn/iocoder/yudao/server/service/UnifiedSocialCreditService.java
+++ b/yudao-server/src/main/java/cn/iocoder/yudao/server/service/UnifiedSocialCreditService.java
@ -1,5 +1,6 @@
 package cn.iocoder.yudao.server.service;
 import cn.hutool.core.collection.CollUtil;
 import org.springframework.stereotype.Service;
 import java.util.*;
@ -46,13 +47,52 @@ public class UnifiedSocialCreditService {
    public Map<String, Object> extractBusinessLicenseInfo(List<String> texts) {
        Map<String, Object> data = new HashMap<>();
        // 提取公司名称
-        String mergedText = preprocess(texts);
+        List<String> resultList = filterBusinessScope(texts);
        String mergedText = preprocess(resultList);
        extractCompanies(mergedText, data);
        extractCode(mergedText, data);
        extractAddress(mergedText, data);
        return data;
    }
-
+    /**
     * 过滤掉 list 中 "经营范围" 及其符合条件的下一项内容
     *
     * @param originalList 原始字符串列表
     * @return 处理后的列表
     */
    public static List<String> filterBusinessScope(List<String> originalList) {
        if (CollUtil.isEmpty(originalList)) {
            return CollUtil.newArrayList();
        }
        List<String> list = CollUtil.newArrayList(originalList);
        for (int i = 0; i < list.size(); i++) {
            if ("经营范围".equals(list.get(i))) {
                list.remove(i);
                // 检查是否还有下一项
                if (i < list.size()) {
                    String nextItem = list.get(i);
                    // 判断是否不包含 “号”、“室”、“路”、“村”
                    if (!containsAnyKeyword(nextItem, "号", "室", "路", "村")) {
                        list.remove(i); // 不包含则移除
                        i--; // 删除后索引回退
                    }
                }
                i--; // 继续检查当前位置的新元素
            }
        }
        return list;
    }
    /**
     * 判断字符串中是否包含任意一个关键字
     */
    private static boolean containsAnyKeyword(String str, String... keywords) {
        for (String keyword : keywords) {
            if (str.contains(keyword)) {
                return true;
            }
        }
        return false;
    }
    private static void extractCompanies(String text, Map<String, Object> data) {
        Matcher m = COMPANY_PATTERN.matcher(text);
        while (m.find()) {
--- a/yudao-server/src/main/resources/application-local.yaml
+++ b/yudao-server/src/main/resources/application-local.yaml
@ -48,7 +48,7 @@ spring:
      primary: master
      datasource:
        master:
-          url: jdbc:mysql://127.0.0.1:3306/lq_ocr?useSSL=false&serverTimezone=Asia/Shanghai&allowPublicKeyRetrieval=true&nullCatalogMeansCurrent=true&rewriteBatchedStatements=true # MySQL Connector/J 8.X 连接的示例
+          url: jdbc:mysql://192.168.130.153:3306/lq_ocr?useSSL=false&serverTimezone=Asia/Shanghai&allowPublicKeyRetrieval=true&nullCatalogMeansCurrent=true&rewriteBatchedStatements=true # MySQL Connector/J 8.X 连接的示例
          #          url: jdbc:mysql://127.0.0.1:3306/ruoyi-vue-pro?useSSL=true&allowPublicKeyRetrieval=true&useUnicode=true&characterEncoding=UTF-8&serverTimezone=Asia/Shanghai&rewriteBatchedStatements=true # MySQL Connector/J 5.X 连接的示例
          #          url: jdbc:postgresql://127.0.0.1:5432/ruoyi-vue-pro # PostgreSQL 连接的示例
          #          url: jdbc:oracle:thin:@127.0.0.1:1521:xe # Oracle 连接的示例
@ -66,7 +66,7 @@ spring:
          #          password: Yudao@2024 # OpenGauss 连接的示例
        slave: # 模拟从库，可根据自己需要修改
          lazy: true # 开启懒加载，保证启动速度
-          url: jdbc:mysql://127.0.0.1:3306/ruoyi-vue-pro?useSSL=false&serverTimezone=Asia/Shanghai&allowPublicKeyRetrieval=true&rewriteBatchedStatements=true&nullCatalogMeansCurrent=true
+          url: jdbc:mysql://192.168.130.153:3306/ruoyi-vue-pro?useSSL=false&serverTimezone=Asia/Shanghai&allowPublicKeyRetrieval=true&rewriteBatchedStatements=true&nullCatalogMeansCurrent=true
          username: root
          password: 123456
 #        tdengine: # IoT 数据库（需要 IoT 物联网再开启噢！）
@ -79,7 +79,7 @@ spring:
  # Redis 配置。Redisson 默认的配置足够使用，一般不需要进行调优
  redis:
-    host: 127.0.0.1 # 地址
+    host: 192.168.130.153 # 地址
    port: 6379 # 端口
    database: 0 # 数据库索引
    password: root # 密码，建议生产环境开启