增加配置监督检验报告项

master
tyc 6 months ago
parent 90a7edc4f7
commit a661efa35d
  1. 75
      yudao-server/src/main/java/cn/iocoder/yudao/server/service/IdCardInfoService.java
  2. 338
      yudao-server/src/main/java/cn/iocoder/yudao/server/service/OcrProcessingService.java
  3. 62
      yudao-server/src/main/java/cn/iocoder/yudao/server/service/SpeicalCertificateService.java
  4. 216
      yudao-server/src/main/java/cn/iocoder/yudao/server/service/UnifiedSocialCreditService.java
  5. 167
      yudao-server/src/main/java/cn/iocoder/yudao/server/service/VehicleLicenseService.java

@ -0,0 +1,75 @@
package cn.iocoder.yudao.server.service;
import cn.hutool.core.util.StrUtil;
import org.springframework.stereotype.Service;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Extracts structured fields (name, ID number, address) from the OCR text of a
 * Chinese resident identity card.
 *
 * <p>Fields that cannot be located are simply absent from the returned map.
 */
@Service
public class IdCardInfoService {

    /** ID number: the "公民身份号码" label followed by 17 digits and a digit or X. */
    private static final Pattern ID_PATTERN = Pattern.compile("公民身份号码(\\d{17}[0-9Xx])");

    /** Address: everything between the "住址" label and the "公民身份号码" label. */
    private static final Pattern ADDRESS_PATTERN = Pattern.compile("住址(.*?)公民身份号码");

    /** "民族" label, also accepting the OCR misread "民旅". */
    private static final Pattern ETHNIC_LABEL_PATTERN = Pattern.compile("民[族旅]");

    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s");

    private static final String NAME_LABEL = "姓名";
    private static final String GENDER_LABEL = "性别";

    /**
     * Extracts the ID card fields from the OCR text fragments.
     *
     * @param texts OCR text fragments in reading order; may be {@code null} or empty
     * @return insertion-ordered map with the keys "姓名", "身份证号" and "住址" for
     *         every field that was found; never {@code null}
     */
    public Map<String, Object> extractIdCardInfo(List<String> texts) {
        Map<String, Object> data = new LinkedHashMap<>();
        if (texts == null || texts.isEmpty()) {
            return data;
        }
        // Fragments are concatenated so that labels split across fragments still match.
        String mergedText = String.join("", texts);
        extractNameWithValidation(mergedText, data);
        extractIdNumber(mergedText, data);
        return data;
    }

    /**
     * Extracts the ID number and the address from the merged text.
     *
     * @param mergedText concatenated OCR text
     * @param data       result map that receives "身份证号" and/or "住址"
     */
    private void extractIdNumber(String mergedText, Map<String, Object> data) {
        // Whitespace is removed once so both patterns see the same compact text.
        String compact = WHITESPACE_PATTERN.matcher(mergedText).replaceAll("");

        Matcher idMatcher = ID_PATTERN.matcher(compact);
        if (idMatcher.find()) {
            // Normalise a lower-case check character ("x") to "X".
            data.put("身份证号", idMatcher.group(1).toUpperCase());
        }

        Matcher addressMatcher = ADDRESS_PATTERN.matcher(compact);
        if (addressMatcher.find()) {
            data.put("住址", addressMatcher.group(1));
        }
    }

    /**
     * Extracts the holder's name.
     *
     * <p>The name is normally the text between the "姓名" and "性别" labels. When
     * that span is blank or missing, the text in front of the "姓名" label is used
     * instead. Nothing is stored when the "姓名" label is absent, because there is
     * then no anchor for the name.
     *
     * @param texts concatenated OCR text
     * @param data  result map that receives "姓名"
     */
    private void extractNameWithValidation(String texts, Map<String, Object> data) {
        if (!texts.contains(NAME_LABEL)) {
            // Without the label, subBefore would return the whole OCR text as the "name".
            return;
        }
        String nameBetween = StrUtil.subBetween(texts, NAME_LABEL, GENDER_LABEL);
        String name = StrUtil.isBlank(nameBetween)
                ? StrUtil.subBefore(texts, NAME_LABEL, false)
                : processEthnicKeyword(nameBetween);
        name = StrUtil.trim(name);
        if (StrUtil.isNotEmpty(name)) {
            data.put("姓名", name);
        }
    }

    /**
     * Cuts the input at the first "民族"/"民旅" label, if present.
     *
     * @param input text located between the name and gender labels
     * @return the part before the ethnic-group label, or the input unchanged when
     *         no such label occurs
     */
    private String processEthnicKeyword(String input) {
        Matcher matcher = ETHNIC_LABEL_PATTERN.matcher(input);
        return matcher.find() ? input.substring(0, matcher.start()) : input;
    }
}

@ -1,6 +1,5 @@
package cn.iocoder.yudao.server.service;
import cn.hutool.core.util.StrUtil;
import cn.hutool.http.HttpRequest;
import cn.hutool.http.HttpResponse;
import cn.hutool.json.JSONUtil;
@ -8,113 +7,27 @@ import cn.iocoder.yudao.framework.common.exception.ErrorCode;
import cn.iocoder.yudao.framework.common.exception.ServiceException;
import cn.iocoder.yudao.server.controller.vo.OcrResVO;
import cn.iocoder.yudao.server.controller.vo.ThirdPartyOcrResult;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
@Service
public class OcrProcessingService {
// 完整百家姓集合(包含504个单姓和复姓)
private static final Set<String> SURNAMES = new LinkedHashSet<>(650);
private static final Pattern NAME_PATTERN = Pattern.compile(
"姓名[::]?\\s*([\\u4e00-\\u9fa5]{2,4})(?=[^\\u4e00-\\u9fa5]|$)"
);
// 身份证号的正则表达式
private static final Pattern ID_PATTERN = Pattern.compile("公民身份号码(\\d{17}[0-9Xx])");
// 地址的正则表达式
private static final Pattern ADDRESS_PATTERN = Pattern.compile("住址(.*?)公民身份号码");
static {
// 初始化姓氏数据
initSurnames();
}
// 省级行政区划全称(包含省、自治区、直辖市、特别行政区)
private static final String[] PROVINCES = {
"北京市", "天津市", "河北省", "山西省", "内蒙古自治区", "辽宁省", "吉林省", "黑龙江省",
"上海市", "江苏省", "浙江省", "安徽省", "福建省", "江西省", "山东省", "河南省",
"湖北省", "湖南省", "广东省", "广西省", "海南省", "重庆市", "四川省", "贵州省",
"云南省", "西藏自治区", "陕西省", "甘肃省", "青海省", "宁夏自治区", "新疆自治区",
"香港特别行政区", "澳门特别行政区"
};
private static final Pattern CODE_PATTERN = Pattern.compile(
"(?:统一社会信用代码|信用代码|注册号)[\\s::]*([A-Z0-9]{18})" + // 关键词匹配
"|(?<![A-Z0-9])([A-Z0-9]{18})(?![A-Z0-9])", // 独立代码匹配
Pattern.CASE_INSENSITIVE
);
// 地址正则(支持OCR容错)
private static final Pattern COM_ADDRESS_PATTERN = Pattern.compile(
"(?:住[所\\u4e00-\\u9fa5]{0,6}所|地址)[\\s::]*" +
"((?:[\\u4e00-\\u9fa5]+[省市县区])" + // 省级定位
"(?:[\\u4e00-\\u9fa5]+[市镇乡村街路])" + // 市级定位
"(?:[\\u4e00-\\u9fa5]*\\d+号?)" + // 门牌号
"(?:[\\u4e00-\\u9fa5]*\\d+室))" + // 室号
"(?=\\s|扫描二维码|成立日期|注册资本|http|$)" // 精准终止符
);
private static void initSurnames() {
// 单姓列表(444个)
SURNAMES.addAll(Arrays.asList(
"王","李","张","刘","陈","杨","黄","赵","周","吴","徐","孙","马","朱","胡","林","郭","何","高","罗",
"郑","梁","谢","宋","唐","许","邓","冯","韩","曹","曾","彭","萧","蔡","潘","田","董","袁","于","余",
"叶","蒋","杜","苏","魏","程","吕","丁","沈","任","姚","卢","傅","钟","姜","崔","谭","廖","范","汪",
"陆","金","石","戴","贾","韦","夏","邱","方","侯","邹","熊","孟","秦","白","江","阎","薛","尹","段",
"雷","黎","史","龙","陶","贺","顾","毛","郝","龚","邵","万","钱","严","赖","覃","洪","武","莫","孔",
"向","常","汤","文","牛","樊","葛","邢","安","齐","易","乔","伍","庞","颜","倪","庄","聂","章","鲁",
"岳","翟","殷","詹","申","欧","耿","关","兰","焦","俞","左","柳","甘","祝","包","宁","尚","符","舒",
"阮","柯","纪","梅","童","凌","毕","季","裴","霍","涂","成","苗","谷","盛","曲","翁","冉","骆","蓝",
"路","游","辛","靳","管","柴","蒙","鲍","华","喻","祁","蒲","房","滕","屈","饶","解","牟","艾","尤",
"阳","时","穆","农","司","古","吉","缪","简","车","项","连","芦","麦","褚","娄","窦","戚","岑","党",
"宫","费","卜","冷","晏","席","卫","米","柏","宗","邬","瞿","商","谈","靳","邰","姬","申","扶","堵",
"冉","宰","雍","郤","璩","桑","桂","濮","牛","寿","通","边","扈","燕","冀","郏","浦","尚","农",
"温","别","庄","晏","柴","瞿","阎","充","慕","连","茹","习","宦","艾","鱼","容","向","戈","庾","暨",
"居","衡","步","都","耿","满","弘","匡","国","文","寇","广","禄","阙","东","欧","殳","沃","利","蔚",
"越","夔","隆","师","巩","厍","聂","晁","勾","敖","融","冷","訾","辛","阚","那","简","饶","空",
"曾","毋","沙","乜","养","鞠","须","丰","巢","关","蒯","相","查","后","荆","红","游","竺","权","逯",
"盖","益","桓","公","万俟","司马","上官","欧阳"
));
// 复姓列表(60个)及前缀处理
SURNAMES.addAll(Arrays.asList(
"欧阳","上官","皇甫","令狐","诸葛","司马","宇文","尉迟","慕容","闾丘",
"公羊","澹台","公冶","宗政","濮阳","申屠","公孙","仲孙","轩辕","鲜于",
"钟离","长孙","端木","拓跋","东郭","呼延","羊舌","万俟","南宫","西门",
"亓官","司寇","颛孙","子车","巫马","壤驷","漆雕","乐正","宰父","谷梁",
"段干","梁丘","东门","公西","微生","公户","公玉","公仪","仲长","叔孙",
"屈突","尔朱","斛斯","轩辕","赫连","长孙"
));
// 添加复姓前缀以增强匹配
SURNAMES.addAll(Arrays.asList(
"欧","上","皇","令","诸","司","宇","尉","慕","闾",
"公","澹","宗","濮","申","孙","仲","轩","鲜","钟",
"长","端","拓","东","呼","羊","万","南","西","亓",
"颛","子","巫","壤","漆","乐","宰","谷","段","梁",
"微","叔","屈","尔","斛","赫"
));
}
// 行驶证正则表达式预编译
private static final Pattern VEHICLE_PATTERN = Pattern.compile(
"(号牌号码|住址|车辆识别代号|发证日期)[::]?\\s*([^\\s]+)"
);
// 公司名称正则(增强版)
private static final Pattern COMPANY_PATTERN = Pattern.compile(
"(?:公司名称|名称|企业名称|称)[\\s::]*" + // 名称标识符
"([\\u4e00-\\u9fa5]{2,}(?:省|市|区|县)?" + // 地域信息
"[\\u4e00-\\u9fa5]+" + // 字号
"(?:科技|发展|物流|商贸)[\\u4e00-\\u9fa5]*" + // 行业特征
"有限公司)" // 组织形式
);
private static final String OCR_SERVICE_URL = "http://192.168.130.192:5000/ocr";
@Autowired
private IdCardInfoService idCardInfoService;
@Autowired
private VehicleLicenseService vehicleLicenseService;
@Autowired
private UnifiedSocialCreditService unifiedSocialCreditService;
@Autowired
private SpeicalCertificateService speicalCertificateService;
public OcrResVO process(String filePath) {
// 调用第三方OCR服务
ThirdPartyOcrResult ocrResult = callOcrService(filePath);
// 解析OCR结果
return parseOcrResults(ocrResult);
}
@ -196,16 +109,16 @@ public class OcrProcessingService {
// 公共基础字段
switch (docType) {
case "身份证":
data.putAll(extractIdCardInfo(texts));
data.putAll(idCardInfoService.extractIdCardInfo(texts));
break;
case "营业执照":
data.putAll(extractBusinessLicenseInfo(texts));
data.putAll(unifiedSocialCreditService.extractBusinessLicenseInfo(texts));
break;
case "行驶证":
data.putAll(extractVehicleLicenseInfo(texts));
data.putAll(vehicleLicenseService.extractVehicleLicenseInfo(texts));
break;
case "监督检验证书":
data.putAll(extractSpeicalCertificateInfo(texts));
data.putAll(speicalCertificateService.extractSpeicalCertificateInfo(texts));
break;
default:
@ -220,232 +133,7 @@ public class OcrProcessingService {
.collect(Collectors.joining("+"))
);
}
return data;
}
private Map<String, Object> extractIdCardInfo(List<String> texts) {
Map<String, Object> data = new LinkedHashMap<>();
// 合并多页文本
String mergedText = String.join("", texts);
extractNameWithValidation(mergedText, data);
extractIdNumber(mergedText, data);
return data;
}
// 身份证号提取(保持原有逻辑)
private void extractIdNumber(String mergedText, Map<String, Object> data) {
// 匹配身份证号
Matcher idMatcher = ID_PATTERN.matcher(mergedText.replaceAll("\\s", ""));
if (idMatcher.find()) {
data.put("身份证号", idMatcher.group(1).toUpperCase());
}
// 匹配地址
Matcher addressMatcher = ADDRESS_PATTERN.matcher(mergedText.replaceAll("\\s", ""));
if (addressMatcher.find()) {
data.put("住址", addressMatcher.group(1));
}
}
private Map<String, Object> extractBusinessLicenseInfo(List<String> texts) {
Map<String, Object> data = new HashMap<>();
// 提取公司名称
String mergedText = preprocess(texts);
extractCompanies(mergedText, data);
extractCode(mergedText, data);
extractAddress(mergedText, data);
return data;
}
private static void extractCompanies(String text, Map<String, Object> data) {
Matcher m = COMPANY_PATTERN.matcher(text);
while (m.find()) {
String company = m.group(1)
.replaceAll("[\\s ]+", "") // 清除全/半角空格
.replaceAll("[()]", ""); // 清除括号
data.put("企业名称", company);
return ;
}
// 兜底匹配:直接查找XX有限公司
Matcher fallback = Pattern.compile(
"([\\u4e00-\\u9fa5]{2,}(?:省|市|区|县)?[\\u4e00-\\u9fa5]+有限公司)"
).matcher(text);
while (fallback.find()) {
data.put("企业名称", fallback.group(1));
}
return ;
}
// 预处理(关键增强)
private static String preprocess(List<String> texts) {
return String.join("", texts)
// 修复中文与代码粘连(排除代码内部字符)
.replaceAll("([\u4e00-\u9fa5])(?=[A-Z0-9]{18})", "$1 ") // 中文后接完整代码加空格0
.replaceAll("([\u4e00-\u9fa5])([A-Z0-9]{1,17}(?!\\d))", "$1 $2") // 部分粘连处理
// 保留关键代码段
.replaceAll("(系统|二维码)(\\d{5})", "$1 $2") // 部分数字处理
.replaceAll("\\s+", " ");
}
// 信用代码提取(调试增强版)
private static void extractCode(String text, Map<String, Object> data) {
System.out.println("DEBUG[预处理文本]:" + text); // 调试输出
Matcher m = CODE_PATTERN.matcher(text);
while (m.find()) {
String code = Optional.ofNullable(m.group(1)).orElse(m.group(2));
System.out.println("DEBUG[匹配候选]:" + code); // 调试输出
if (isValidCode(code)) {
data.put("统一社会信用代码", code);
return;
}
}
System.out.println("DEBUG[未找到有效代码]");
}
// 有效性校验(宽松模式)
private static boolean isValidCode(String code) {
return code != null &&
code.length() == 18 &&
code.matches("^[1959][A-Z0-9]{17}");
}
// 地址提取(结构化匹配)
private static void extractAddress(String text, Map<String, Object> data) {
Matcher m = COM_ADDRESS_PATTERN.matcher(text);
if (m.find()) {
String address = m.group(1)
.replaceAll("\\s+", "")
.replaceAll("扫.*日", "")
.replaceAll("([号路])(\\d)", "$1$2"); // 修复门牌号格式
data.put("住所", address);
return;
}
// 兜底匹配逻辑(省级定位)
for (String province : PROVINCES) {
int start = text.indexOf(province);
if (start != -1) {
String address = text.substring(start, text.indexOf("室", start) + 1)
.replaceAll("扫.*日", "")
.replaceAll("[^\\u4e00-\\u9fa5\\d]", "");
data.put("住所", address);
return;
}
}
}
/**
* 监督检验证书
* @param texts
* @return
*/
private Map<String, Object> extractSpeicalCertificateInfo(List<String> texts){
Map<String, Object> data = new HashMap<>();
// 提取公司名称
String mergedText = String.join("|", texts);
extractManufacturerName(mergedText, data);
//extractCode(mergedText, data);
//extractAddress(mergedText, data);
return data;
}
/**
* 提取监督检验证书的制造单位名称
* @param text
* @param data
*/
private static void extractManufacturerName(String text, Map<String, Object> data) {
return ;
}
// 公司名称提取方法
private Map<String, Object> extractVehicleLicenseInfo(List<String> texts) {
Map<String, Object> data = new HashMap<>();
texts.forEach(text -> {
Matcher matcher = VEHICLE_PATTERN.matcher(text);
if (matcher.find()) {
String key = matcher.group(1);
String value = matcher.group(2);
switch (key) {
case "号牌号码":
data.put("号牌号码", value.replaceAll("[^\\u4e00-\\u9fa5A-Z0-9]", ""));
break;
case "住址":
data.put("住址", value);
break;
case "车辆识别代号":
data.put("车辆识别代号", value.toUpperCase());
break;
case "发证日期":
if (value.matches("\\d{4}-\\d{2}-\\d{2}")) {
data.put("发证日期", value);
}
break;
default:
break;
}
}
});
return data;
}
/**
* 身份证姓名提取
* @param texts
* @param data
*/
private void extractNameWithValidation(String texts, Map<String, Object> data) {
data.put("姓名", StrUtil.subBetween(texts, "姓名", "性别"));
return;
}
// 辅助方法:查找"姓名"关键词位置
private int findNameKeywordIndex(List<String> texts) {
return IntStream.range(0, texts.size())
.filter(i -> texts.get(i).contains("姓名"))
.findFirst()
.orElse(-1);
}
// 辅助方法:检查相邻行
private boolean checkAdjacentLine(List<String> texts, int index, Map<String, Object> data) {
if (index >= 0 && index < texts.size()) {
String line = texts.get(index).trim();
if (isValidName(line)) {
data.put("姓名", line);
return true;
}
}
return false;
}
// 增强版姓名验证(包含复姓处理)
private boolean isValidName(String candidate) {
// 基础验证
if (candidate == null || candidate.isEmpty()) {
return false;
}
String cleaned = candidate.replaceAll("[^\\u4e00-\\u9fa5]", "");
if (cleaned.length() < 2 || cleaned.length() > 4) {
return false;
}
// 复姓优先验证(2-3字)
for (int len = Math.min(3, cleaned.length()); len >= 2; len--) {
String prefix = cleaned.substring(0, len);
if (SURNAMES.contains(prefix)) {
return true;
}
}
// 单姓验证
return SURNAMES.contains(cleaned.substring(0, 1));
}
}

@ -0,0 +1,62 @@
package cn.iocoder.yudao.server.service;
import org.springframework.stereotype.Service;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Extracts fields from the OCR text of a supervision inspection certificate
 * (监督检验证书).
 *
 * <p>Recognition is keyed on two fixed organisation names; the batch number and
 * manufacture date are only read when the known inspection agency is present.
 */
@Service
public class SpeicalCertificateService {

    /** Manufacturer name that is recognised verbatim in the OCR text. */
    private static final String MANUFACTURER_NAME = "长春致远新能源装备股份有限公司";

    /** Inspection agency name that is recognised verbatim in the OCR text. */
    private static final String INSPECTION_AGENCY_NAME = "长春特种设备检测研究院";

    private static final String BATCH_NO_LABEL = "产品批号";
    private static final String MANUFACTURE_DATE_LABEL = "制造日期";

    /**
     * Extracts the certificate fields from the OCR text fragments.
     *
     * @param texts OCR text fragments in reading order; may be {@code null} or empty
     * @return map with the keys "制造单位名称", "监督检验机构名称", "产品批号" and
     *         "制造日期" for every field that was found; never {@code null}
     */
    public Map<String, Object> extractSpeicalCertificateInfo(List<String> texts) {
        Map<String, Object> data = new HashMap<>();
        if (texts == null || texts.isEmpty()) {
            return data;
        }
        extractManufacturerName(texts, data);
        getSpeicalCertificateName(texts, data);
        extractManufacturerTime(texts, data);
        return data;
    }

    /**
     * Stores the manufacturer name when any fragment contains it.
     *
     * @param texts OCR text fragments
     * @param data  result map that receives "制造单位名称"
     */
    private static void extractManufacturerName(List<String> texts, Map<String, Object> data) {
        if (containsText(texts, MANUFACTURER_NAME)) {
            data.put("制造单位名称", MANUFACTURER_NAME);
        }
    }

    /**
     * Stores the inspection agency name when any fragment contains it.
     *
     * @param texts OCR text fragments
     * @param data  result map that receives "监督检验机构名称"
     */
    private static void getSpeicalCertificateName(List<String> texts, Map<String, Object> data) {
        if (containsText(texts, INSPECTION_AGENCY_NAME)) {
            data.put("监督检验机构名称", INSPECTION_AGENCY_NAME);
        }
    }

    /**
     * Reads the batch number and manufacture date relative to their label fragments.
     *
     * <p>The batch number is taken two fragments after its label and the
     * manufacture date one fragment after its label. If a label occurs more than
     * once, the last occurrence wins.
     *
     * @param texts OCR text fragments
     * @param data  result map that receives "产品批号" and/or "制造日期"
     */
    private static void extractManufacturerTime(List<String> texts, Map<String, Object> data) {
        if (!containsText(texts, INSPECTION_AGENCY_NAME)) {
            return;
        }
        int size = texts.size();
        for (int i = 0; i < size; i++) {
            String current = texts.get(i);
            if (current == null) {
                continue;
            }
            // OCR fragments may carry stray surrounding whitespace around the label.
            String label = current.trim();
            if (BATCH_NO_LABEL.equals(label) && i + 2 < size) {
                data.put("产品批号", texts.get(i + 2));
            }
            if (MANUFACTURE_DATE_LABEL.equals(label) && i + 1 < size) {
                data.put("制造日期", texts.get(i + 1));
            }
        }
    }

    /**
     * Tells whether any non-null fragment contains the given text.
     *
     * @param texts  OCR text fragments
     * @param needle text to look for
     * @return {@code true} if at least one fragment contains {@code needle}
     */
    private static boolean containsText(List<String> texts, String needle) {
        for (String fragment : texts) {
            if (fragment != null && fragment.contains(needle)) {
                return true;
            }
        }
        return false;
    }
}

@ -0,0 +1,216 @@
package cn.iocoder.yudao.server.service;
import org.springframework.stereotype.Service;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Extracts structured fields (company name, unified social credit code,
 * registered address) from the OCR text of a business licence.
 *
 * <p>Fields that cannot be located are simply absent from the returned map.
 */
@Service
public class UnifiedSocialCreditService {

    /** Full names of the province-level divisions, used to anchor the fallback address search. */
    private static final String[] PROVINCES = {
        "北京市", "天津市", "河北省", "山西省", "内蒙古自治区", "辽宁省", "吉林省", "黑龙江省",
        "上海市", "江苏省", "浙江省", "安徽省", "福建省", "江西省", "山东省", "河南省",
        "湖北省", "湖南省", "广东省", "广西壮族自治区", "海南省", "重庆市", "四川省", "贵州省",
        "云南省", "西藏自治区", "陕西省", "甘肃省", "青海省", "宁夏回族自治区", "新疆维吾尔自治区",
        "香港特别行政区", "澳门特别行政区"
    };

    /** Credit code candidates: either introduced by a label, or a stand-alone 18-character token. */
    private static final Pattern CODE_PATTERN = Pattern.compile(
        "(?:统一社会信用代码|信用代码|注册号)[\\s::]*([A-Z0-9]{18})"
            + "|(?<![A-Z0-9])([A-Z0-9]{18})(?![A-Z0-9])",
        Pattern.CASE_INSENSITIVE
    );

    /** Lenient validity check: 18 characters, the first one being 1, 5 or 9. */
    private static final Pattern VALID_CODE_PATTERN = Pattern.compile("[159][A-Z0-9]{17}");

    /** Structured address: label, region, street, house number and room number. */
    private static final Pattern COM_ADDRESS_PATTERN = Pattern.compile(
        "(?:住[所\\u4e00-\\u9fa5]{0,6}所|地址)[\\s::]*"
            + "((?:[\\u4e00-\\u9fa5]+[省市县区])"
            + "(?:[\\u4e00-\\u9fa5]+[市镇乡村街路])"
            + "(?:[\\u4e00-\\u9fa5]*\\d+号?)"
            + "(?:[\\u4e00-\\u9fa5]*\\d+室))"
            + "(?=\\s|扫描二维码|成立日期|注册资本|http|$)"
    );

    /** Company name introduced by a name label and containing an industry keyword. */
    private static final Pattern COMPANY_PATTERN = Pattern.compile(
        "(?:公司名称|名称|企业名称|称)[\\s::]*"
            + "([\\u4e00-\\u9fa5]{2,}(?:省|市|区|县)?"
            + "[\\u4e00-\\u9fa5]+"
            + "(?:科技|发展|物流|商贸)[\\u4e00-\\u9fa5]*"
            + "有限公司)"
    );

    /** Fallback company name: any run of Chinese characters ending in "有限公司". */
    private static final Pattern COMPANY_FALLBACK_PATTERN = Pattern.compile(
        "([\\u4e00-\\u9fa5]{2,}(?:省|市|区|县)?[\\u4e00-\\u9fa5]+有限公司)"
    );

    /** Text running from "扫" to "日" that can leak into an address. */
    private static final Pattern SCAN_HINT_PATTERN = Pattern.compile("扫.*日");

    /** Everything that is neither a Chinese character nor a digit. */
    private static final Pattern NON_ADDRESS_CHAR_PATTERN = Pattern.compile("[^\\u4e00-\\u9fa5\\d]");

    /** A Chinese character directly followed by a complete 18-character code. */
    private static final Pattern GLUED_FULL_CODE_PATTERN =
        Pattern.compile("([\\u4e00-\\u9fa5])(?=[A-Z0-9]{18})");

    /** A Chinese character directly followed by a shorter alphanumeric run. */
    private static final Pattern GLUED_PARTIAL_CODE_PATTERN =
        Pattern.compile("([\\u4e00-\\u9fa5])([A-Z0-9]{1,17}(?!\\d))");

    /** "系统"/"二维码" directly followed by five digits. */
    private static final Pattern GLUED_DIGITS_PATTERN = Pattern.compile("(系统|二维码)(\\d{5})");

    private static final Pattern WHITESPACE_RUN_PATTERN = Pattern.compile("\\s+");

    /**
     * Extracts the business licence fields from the OCR text fragments.
     *
     * @param texts OCR text fragments in reading order; may be {@code null} or empty
     * @return map with the keys "企业名称", "统一社会信用代码" and "住所" for every
     *         field that was found; never {@code null}
     */
    public Map<String, Object> extractBusinessLicenseInfo(List<String> texts) {
        Map<String, Object> data = new HashMap<>();
        if (texts == null || texts.isEmpty()) {
            return data;
        }
        String mergedText = preprocess(texts);
        extractCompanies(mergedText, data);
        extractCode(mergedText, data);
        extractAddress(mergedText, data);
        return data;
    }

    /**
     * Stores the company name under "企业名称".
     *
     * <p>The labelled pattern is tried first and its first match is used.
     * Otherwise every "...有限公司" run is considered and the last one wins.
     *
     * @param text preprocessed OCR text
     * @param data result map
     */
    private static void extractCompanies(String text, Map<String, Object> data) {
        Matcher labelled = COMPANY_PATTERN.matcher(text);
        if (labelled.find()) {
            // The group can only contain Chinese characters, so no further cleanup is needed.
            data.put("企业名称", labelled.group(1));
            return;
        }
        Matcher fallback = COMPANY_FALLBACK_PATTERN.matcher(text);
        while (fallback.find()) {
            data.put("企业名称", fallback.group(1));
        }
    }

    /**
     * Joins the fragments and separates codes that OCR glued to preceding text.
     *
     * @param texts OCR text fragments
     * @return merged text with whitespace runs collapsed to a single space
     */
    private static String preprocess(List<String> texts) {
        String merged = String.join("", texts);
        merged = GLUED_FULL_CODE_PATTERN.matcher(merged).replaceAll("$1 ");
        merged = GLUED_PARTIAL_CODE_PATTERN.matcher(merged).replaceAll("$1 $2");
        merged = GLUED_DIGITS_PATTERN.matcher(merged).replaceAll("$1 $2");
        return WHITESPACE_RUN_PATTERN.matcher(merged).replaceAll(" ");
    }

    /**
     * Stores the first valid credit code candidate under "统一社会信用代码".
     *
     * @param text preprocessed OCR text
     * @param data result map
     */
    private static void extractCode(String text, Map<String, Object> data) {
        Matcher m = CODE_PATTERN.matcher(text);
        while (m.find()) {
            String candidate = m.group(1) != null ? m.group(1) : m.group(2);
            // Candidates are matched case-insensitively, so normalise before validating;
            // otherwise a code read in lower case could never pass the check.
            String code = candidate.toUpperCase(Locale.ROOT);
            if (isValidCode(code)) {
                data.put("统一社会信用代码", code);
                return;
            }
        }
    }

    /**
     * Lenient validity check for a credit code.
     *
     * @param code candidate code, may be {@code null}
     * @return {@code true} if the code has 18 upper-case alphanumeric characters
     *         and starts with 1, 5 or 9
     */
    private static boolean isValidCode(String code) {
        return code != null && VALID_CODE_PATTERN.matcher(code).matches();
    }

    /**
     * Stores the registered address under "住所".
     *
     * <p>The structured pattern is tried first. Otherwise the address is taken
     * from the first known province name up to the next "室".
     *
     * @param text preprocessed OCR text
     * @param data result map
     */
    private static void extractAddress(String text, Map<String, Object> data) {
        Matcher m = COM_ADDRESS_PATTERN.matcher(text);
        if (m.find()) {
            data.put("住所", SCAN_HINT_PATTERN.matcher(m.group(1)).replaceAll(""));
            return;
        }
        for (String province : PROVINCES) {
            int start = text.indexOf(province);
            if (start == -1) {
                continue;
            }
            int roomIdx = text.indexOf("室", start);
            if (roomIdx == -1) {
                // No room marker after this province: there is no end anchor, and
                // substring(start, 0) would throw, so try the next province instead.
                continue;
            }
            String address = text.substring(start, roomIdx + 1);
            address = SCAN_HINT_PATTERN.matcher(address).replaceAll("");
            address = NON_ADDRESS_CHAR_PATTERN.matcher(address).replaceAll("");
            data.put("住所", address);
            return;
        }
    }
}

@ -0,0 +1,167 @@
package cn.iocoder.yudao.server.service;
import cn.hutool.core.util.ReUtil;
import cn.hutool.core.util.StrUtil;
import org.springframework.stereotype.Service;
import java.time.DateTimeException;
import java.time.LocalDate;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* 行驶证识别类
*/
@Service
public class VehicleLicenseService {
public static void main(String[] args) {
String input = "中华人民共和国机动车行驶证Veltele Lleenseor tPeopiesRepubhlear China号牌导码号牌号码_冀A4647W鄂A4647W档案编号130111431982PlateNo车辆类型Yeeer-Tyie重型半挂牵引车所有人核定较人数2人石家庄畅宇汽车运输有限公司总质量住址河北省石家压市泉区上江镇韩庄村永来街香巷3号整质8870kg核定载所证Adurexs使用性质外尺寸.7400×2550×3560mm准本H2频量48000kg货运昂牌型号Midet解放牌CA4250P66M25T1E6备强制报废期止:2039-11-01河北省石家车辆调润代号NIALFWSRX9L8RIF18215庄市公安局发动机号码ON54120098检验有效期至2025年11月冀A交通管理局注册扫期检验记录RepeierDus2024-11-01发证日期2024-11-91天然气lsueDate300051111965";
// 正则表达式匹配“代号”后以L开头的17位字母数字组合
Pattern pattern = Pattern.compile("代号.*?(L[A-Za-z0-9]{16})");
Matcher matcher = pattern.matcher(input);
if (matcher.find()) {
String code = matcher.group(1);
System.out.println("匹配到的代号: " + code); // 输出:LFWSRX9L8RIF18215
} else {
System.out.println("未找到符合要求的代号");
}
}
public Map<String, Object> extractVehicleLicenseInfo(List<String> texts) {
Map<String, Object> data = new HashMap<>();
String text = String.join("", texts)
.replaceAll("2925-", "2025-")
.replaceAll("\\.", "");
data.put("号牌号码", extractPlateNo(text)); // 冀A4336E
data.put("车辆识别代码", extractVin(text)); // LFWSRX9L2RIF17688
data.put("住址", extractAddress(text)); // 河北省石家庄市鹿泉区上庄镇韩庄村永乐街芳香巷3号
data.put("发证日期", extractIssueDate(text)); // 2024-11-01
return data;
}
private static String extractVin(String text) {
// 正则表达式直接匹配以 L 开头的 17 位字符
Pattern pattern = Pattern.compile("代号.*?(L[A-Za-z0-9]{16})");
Matcher matcher = pattern.matcher(text);
if (matcher.find()) {
String code = matcher.group(1);
System.out.println("匹配到的代号: " + code); // 输出:LFWSRX9L8RIF18215
return code;
} else {
System.out.println("未找到符合要求的代号");
}
return null;
}
private static String extractPlateNo(String text) {
// 综合正则表达式(覆盖所有常见车牌类型)
String regex =
"(?<![0-9A-Z])" // 前导边界控制
+ "(["
+ "京津沪渝冀晋辽吉黑苏浙皖闽赣鲁豫鄂湘粤琼川贵云陕甘青台蒙藏桂宁新港澳使领学警" // 省份简称白名单
+ "])"
+ "("
+ "([A-HJ-NP-ZDF](?:·?[0-9A-HJ-NP-Z]{5,6})" // 新能源/普通车牌
+ "|([A-HJ-NP-Z][0-9A-HJ-NP-Z]{4}[挂学警港澳])" // 普通车牌
+ "|(Z·[0-9A-HJ-NP-Z]{4,5})" // 港澳车牌
+ "|([使领]\\d{6})" // 使馆车牌
+ ")"
+ "(?![0-9A-Z]))"; // 后续边界控制
// 执行匹配
Matcher matcher = Pattern.compile(regex).matcher(text);
if (matcher.find()) {
// 清洗分隔符并返回
return matcher.group().replaceAll("[·\\s]", "");
}
return null;
}
private static String extractAddress(String text) {
// 精确地址提取(到门牌号)
return ReUtil.getGroup1(
"住址([\u4e00-\u9fa5]+?省[\u4e00-\u9fa5]+?市.*?\\d+号)",
text);
}
private static String extractIssueDate(String text) {
// 主正则:严格匹配标准格式
String primaryRegex = "(?:发证日期|IsueDate|ReastrDite)[^\\d]{0,5}(20\\d{2}[-=年]?\\d{1,2}[-=月]?\\d{1,2})";
// 备用正则:精确匹配紧凑格式
String fallbackRegex = "(?i)(?:发证日期|IsueDate|ReastrDite)[^\\d]{0,5}(20\\d{2})-?(0[1-9]|1[0-2])(0[1-9]|[12]\\d|3[01])";
// 优先尝试主正则匹配
String rawDate = ReUtil.get(primaryRegex, text, 1);
// 主正则匹配失败时尝试备用正则
if (StrUtil.isBlank(rawDate)) {
Matcher fallbackMatcher = Pattern.compile(fallbackRegex).matcher(text);
if (fallbackMatcher.find()) {
rawDate = String.format("%s-%s-%s",
fallbackMatcher.group(1),
fallbackMatcher.group(2),
fallbackMatcher.group(3));
}
}
// 兜底方案:精准倒序扫描
if (StrUtil.isBlank(rawDate)) {
// 增强型正则(匹配日期核心部分)
Pattern fallbackPattern = Pattern.compile(
"(20\\d{2}[-/年.]?\\d{2}[-/月.]?\\d{2})|(20\\d{6})"
);
// 获取所有候选并倒序筛选
List<String> candidates = ReUtil.findAll(fallbackPattern, text, 0);
for (int i = candidates.size()-1; i >=0 ; i--) {
String candidate = candidates.get(i);
// 清洗并验证日期有效性
String cleaned = candidate.replaceAll("[^0-9]", "");
if (cleaned.length() != 8) {
continue;
}
try {
int year = Integer.parseInt(cleaned.substring(0,4));
int month = Integer.parseInt(cleaned.substring(4,6));
int day = Integer.parseInt(cleaned.substring(6,8));
if (year < 2000 || year > 2099) {
continue;
}
if (month < 1 || month > 12) {
continue;
}
if (day < 1 || day > 31) {
continue;
}
// 找到第一个有效日期立即返回
rawDate = String.format("%04d-%02d-%02d", year, month, day);
break;
} catch (Exception e) {
continue;
}
}
}
// 最终校验
if (StrUtil.isNotBlank(rawDate)) {
// 排除闰年等复杂校验(按业务需求可扩展)
return rawDate;
}
return null;
}
}
Loading…
Cancel
Save