|
|
|
@ -171,7 +171,7 @@ public class OcrProcessingService { |
|
|
|
|
boolean hasUnifiedSocialCredit = texts.stream().anyMatch(t -> t.contains("统一社会信用代码")); |
|
|
|
|
boolean hasVehicleLicense = texts.stream().anyMatch(t -> t.contains("机动车行驶证")); |
|
|
|
|
boolean hasResidentId = texts.stream().anyMatch(t -> t.contains("居民身份证")); |
|
|
|
|
|
|
|
|
|
boolean hasSpeicalCertificate = texts.stream().anyMatch(t -> t.contains("监督检验证书")); |
|
|
|
|
if (hasIdNumber || hasResidentId) { |
|
|
|
|
return "身份证"; |
|
|
|
|
} |
|
|
|
@ -181,6 +181,9 @@ public class OcrProcessingService { |
|
|
|
|
if (hasVehicleLicense) { |
|
|
|
|
return "行驶证"; |
|
|
|
|
} |
|
|
|
|
if (hasSpeicalCertificate) { |
|
|
|
|
return "监督检验证书"; |
|
|
|
|
} |
|
|
|
|
return "未知类型"; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
@ -201,6 +204,9 @@ public class OcrProcessingService { |
|
|
|
|
case "行驶证": |
|
|
|
|
data.putAll(extractVehicleLicenseInfo(texts)); |
|
|
|
|
break; |
|
|
|
|
case "监督检验证书": |
|
|
|
|
data.putAll(extractSpeicalCertificateInfo(texts)); |
|
|
|
|
break; |
|
|
|
|
default: |
|
|
|
|
|
|
|
|
|
} |
|
|
|
@ -220,10 +226,10 @@ public class OcrProcessingService { |
|
|
|
|
|
|
|
|
|
private Map<String, Object> extractIdCardInfo(List<String> texts) { |
|
|
|
|
Map<String, Object> data = new LinkedHashMap<>(); |
|
|
|
|
extractNameWithValidation(texts, data); |
|
|
|
|
|
|
|
|
|
// 合并多页文本
|
|
|
|
|
String mergedText = String.join("", texts); |
|
|
|
|
|
|
|
|
|
extractNameWithValidation(mergedText, data); |
|
|
|
|
extractIdNumber(mergedText, data); |
|
|
|
|
return data; |
|
|
|
|
} |
|
|
|
@ -274,7 +280,7 @@ public class OcrProcessingService { |
|
|
|
|
private static String preprocess(List<String> texts) { |
|
|
|
|
return String.join("", texts) |
|
|
|
|
// 修复中文与代码粘连(排除代码内部字符)
|
|
|
|
|
.replaceAll("([\u4e00-\u9fa5])(?=[A-Z0-9]{18})", "$1 ") // 中文后接完整代码加空格
|
|
|
|
|
.replaceAll("([\u4e00-\u9fa5])(?=[A-Z0-9]{18})", "$1 ") // 中文后接完整代码加空格0
|
|
|
|
|
.replaceAll("([\u4e00-\u9fa5])([A-Z0-9]{1,17}(?!\\d))", "$1 $2") // 部分粘连处理
|
|
|
|
|
// 保留关键代码段
|
|
|
|
|
.replaceAll("(系统|二维码)(\\d{5})", "$1 $2") // 部分数字处理
|
|
|
|
@ -317,9 +323,7 @@ public class OcrProcessingService { |
|
|
|
|
data.put("住所", address); |
|
|
|
|
return; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// 兜底匹配逻辑(省级定位)
|
|
|
|
|
|
|
|
|
|
for (String province : PROVINCES) { |
|
|
|
|
int start = text.indexOf(province); |
|
|
|
|
if (start != -1) { |
|
|
|
@ -332,9 +336,31 @@ public class OcrProcessingService { |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// 公司名称提取方法
|
|
|
|
|
/** |
|
|
|
|
* 监督检验证书 |
|
|
|
|
* @param texts |
|
|
|
|
* @return |
|
|
|
|
*/ |
|
|
|
|
private Map<String, Object> extractSpeicalCertificateInfo(List<String> texts){ |
|
|
|
|
Map<String, Object> data = new HashMap<>(); |
|
|
|
|
// 提取公司名称
|
|
|
|
|
String mergedText = String.join("|", texts); |
|
|
|
|
extractManufacturerName(mergedText, data); |
|
|
|
|
//extractCode(mergedText, data);
|
|
|
|
|
//extractAddress(mergedText, data);
|
|
|
|
|
return data; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/** |
|
|
|
|
* 提取监督检验证书的制造单位名称 |
|
|
|
|
* @param text |
|
|
|
|
* @param data |
|
|
|
|
*/ |
|
|
|
|
private static void extractManufacturerName(String text, Map<String, Object> data) { |
|
|
|
|
|
|
|
|
|
return ; |
|
|
|
|
} |
|
|
|
|
// 公司名称提取方法
|
|
|
|
|
private Map<String, Object> extractVehicleLicenseInfo(List<String> texts) { |
|
|
|
|
Map<String, Object> data = new HashMap<>(); |
|
|
|
|
|
|
|
|
@ -369,48 +395,15 @@ public class OcrProcessingService { |
|
|
|
|
|
|
|
|
|
return data; |
|
|
|
|
} |
|
|
|
|
private void extractNameWithValidation(List<String> texts, Map<String, Object> data) { |
|
|
|
|
// 方案1:直接匹配模式
|
|
|
|
|
Optional<String> directMatch = texts.stream() |
|
|
|
|
.map(text -> { |
|
|
|
|
Matcher m = NAME_PATTERN.matcher(text); |
|
|
|
|
return m.find() ? m.group(1) : null; |
|
|
|
|
}) |
|
|
|
|
.filter(Objects::nonNull) |
|
|
|
|
.findFirst(); |
|
|
|
|
|
|
|
|
|
if (directMatch.isPresent() && isValidName(directMatch.get())) { |
|
|
|
|
data.put("姓名", directMatch.get()); |
|
|
|
|
return; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// 方案2:邻近行匹配(优化版)
|
|
|
|
|
int nameIndex = findNameKeywordIndex(texts); |
|
|
|
|
if (nameIndex != -1) { |
|
|
|
|
// 检查本行剩余内容
|
|
|
|
|
String currentLine = texts.get(nameIndex).replaceAll("姓名[::]?", ""); |
|
|
|
|
if (isValidName(currentLine)) { |
|
|
|
|
data.put("姓名", currentLine.trim()); |
|
|
|
|
return; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// 优先检查下一行
|
|
|
|
|
if (checkAdjacentLine(texts, nameIndex + 1, data)) { |
|
|
|
|
return; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// 其次检查上一行
|
|
|
|
|
if (checkAdjacentLine(texts, nameIndex - 1, data)) { |
|
|
|
|
return; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// 方案3:全文本扫描+智能验证
|
|
|
|
|
texts.stream() |
|
|
|
|
.flatMap(text -> Arrays.stream(text.split("[\\s\\d]+"))) // 排除含数字的噪声
|
|
|
|
|
.filter(this::isValidName) |
|
|
|
|
.findFirst() |
|
|
|
|
.ifPresent(name -> data.put("姓名", name)); |
|
|
|
|
/** |
|
|
|
|
* 身份证姓名提取 |
|
|
|
|
* @param texts |
|
|
|
|
* @param data |
|
|
|
|
*/ |
|
|
|
|
private void extractNameWithValidation(String texts, Map<String, Object> data) { |
|
|
|
|
data.put("姓名", StrUtil.subBetween(texts, "姓名", "性别")); |
|
|
|
|
return; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// 辅助方法:查找"姓名"关键词位置
|
|
|
|
|