// WdcApplicationTests.java
  1. package cn.cslg.wdc;
  2. import cn.cslg.wdc.common.exception.XiaoShiException;
  3. import cn.cslg.wdc.dto.SelectCaseInfoDTO;
  4. import cn.cslg.wdc.dto.common.SectionDiffCommandVisitor;
  5. import cn.cslg.wdc.entity.AssoCaseFile;
  6. import cn.cslg.wdc.entity.CaseFile;
  7. import cn.cslg.wdc.entity.Discrepancy;
  8. import cn.cslg.wdc.mapper.AssoCaseFileMapper;
  9. import cn.cslg.wdc.mapper.CaseFileMapper;
  10. import cn.cslg.wdc.service.CaseFileService;
  11. import cn.cslg.wdc.service.DiscrepancyService;
  12. import cn.cslg.wdc.service.common.CosineSimilarityService;
  13. import cn.cslg.wdc.service.common.FileManagerService;
  14. import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
  15. import org.apache.commons.compress.utils.IOUtils;
  16. import org.apache.commons.lang3.ObjectUtils;
  17. import org.apache.commons.lang3.StringUtils;
  18. import org.apache.commons.text.diff.EditScript;
  19. import org.apache.commons.text.diff.StringsComparator;
  20. import org.apache.poi.hwpf.extractor.WordExtractor;
  21. import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
  22. import org.apache.poi.xwpf.usermodel.XWPFDocument;
  23. import org.apache.poi.xwpf.usermodel.XWPFParagraph;
  24. import org.apache.poi.xwpf.usermodel.XWPFRun;
  25. import org.checkerframework.checker.units.qual.A;
  26. import org.jsoup.Jsoup;
  27. import org.jsoup.nodes.Document;
  28. import org.jsoup.nodes.Element;
  29. import org.jsoup.nodes.Node;
  30. import org.jsoup.nodes.TextNode;
  31. import org.jsoup.select.Elements;
  32. import org.junit.jupiter.api.Test;
  33. import org.springframework.beans.factory.annotation.Autowired;
  34. import org.springframework.boot.test.context.SpringBootTest;
  35. import org.springframework.util.CollectionUtils;
  36. import java.io.*;
  37. import java.math.BigDecimal;
  38. import java.math.RoundingMode;
  39. import java.util.*;
  40. @SpringBootTest
  41. class WdcApplicationTests {
  42. @Autowired
  43. private AssoCaseFileMapper assoCaseFileMapper;
  44. @Autowired
  45. private CaseFileMapper caseFileMapper;
  46. @Autowired
  47. private DiscrepancyService discrepancyService;
  48. @Autowired
  49. private CosineSimilarityService cosineSimilarityService;
  50. @Autowired
  51. private FileManagerService fileManagerService;
  52. @Autowired
  53. private CaseFileService caseFileService;
  54. @Test
  55. void contextLoads() throws IOException {
  56. String s1 = "46283d7fdae3413491da50dfd3b92364";
  57. String s2 = "f8d77ae1ce2f4d298064ecaecf5ff301";
  58. Integer s = 8;
  59. String s3 = "241023-P20241102-PACN2417839-翻译方法、翻译装置、电子设备以及计算机可读存储介质-专利申请文件-v1F.docx";
  60. String s4 = "241024-S2435631-测试卷-压缩包-语音翻译方法及装置、电子设备以及计算机可读存储介质-新申请文档-定稿.docx";
  61. caseFileService.getDiscrepancyByFile(s1, s2, s, s3, s4);
  62. System.out.println("AAAAAA");
  63. // final Long count = discrepancyService.getBaseMapper().selectCount(new LambdaQueryWrapper<Discrepancy>());
  64. // System.out.println(count);
  65. // Discrepancy discrepancy = new Discrepancy();
  66. // discrepancy.setCaseNo("sajdsak");
  67. // discrepancy.setFirstDiscrepancy("0.36");
  68. // discrepancy.setSecondDiscrepancy("0.66");
  69. // discrepancy.insert();
  70. }
  71. private static void processNode(XWPFParagraph paragraph, Node node) {
  72. XWPFRun run = paragraph.createRun();
  73. if (node instanceof TextNode) {
  74. run.setText(((TextNode) node).text());
  75. } else if (node instanceof Element) {
  76. Element element = (Element) node;
  77. String tagName = element.tagName();
  78. if ("em".equalsIgnoreCase(tagName)) {
  79. run.setText(element.text());
  80. run.setItalic(true);
  81. } else if ("del".equalsIgnoreCase(tagName)) {
  82. run.setText(element.text());
  83. run.setStrike(true);
  84. } else {
  85. // 递归处理其他标签(如果有)
  86. for (Node childNode : element.childNodes()) {
  87. processNode(paragraph, childNode);
  88. }
  89. }
  90. }
  91. }
  92. @Test
  93. public void test1() throws Exception {
  94. String path = "F:\\file\\测试\\word对比\\CCC.docx";
  95. XWPFDocument document = new XWPFDocument(new FileInputStream(path));
  96. XWPFWordExtractor extractor = new XWPFWordExtractor(document);
  97. String text = extractor.getText();
  98. System.out.println(text);
  99. document.close();
  100. Document doc = Jsoup.parse(text);
  101. Elements paragraphs = doc.select("p");
  102. }
  103. @Test
  104. public void test117() {
  105. // String path = "F:\\file\\测试\\word对比\\S2418519-测试卷-PACN2414802-一种用于降低GIDL的混合型Gate结构及制备-权利要求书-v1r01-sq.docx";
  106. // String path1 = "F:\\file\\测试\\word对比\\240722-S2418519-测试卷-PACN2414802-一种用于降低GIDL的混合型Gate结构及制备-申请全文-v1F.docx";
  107. // String path = "F:\\file\\测试\\word对比\\240722-S2418519-测试卷-PACN2414802-一种用于降低GIDL的混合型Gate结构及制备-申请全文-v1F.docx";
  108. // String path1 = "F:\\file\\测试\\word对比\\240805-S2418519-测试卷-PACN2414802-一种用于降低GIDL的混合型Gate结构及制备-申请全文-v3F-清洁版.docx";
  109. String path = "F:\\file\\测试\\word对比\\AAA.docx";
  110. String path1 = "F:\\file\\测试\\word对比\\BBBB.docx";
  111. if (!path.endsWith(".doc") && !path.endsWith(".docx")) {
  112. throw new XiaoShiException("请上传Word文件");
  113. }
  114. if (!path1.endsWith(".doc") && !path1.endsWith(".docx")) {
  115. throw new XiaoShiException("请上传Word文件");
  116. }
  117. try {
  118. String text = "";
  119. String text1 = "";
  120. if (path.endsWith(".docx")) {
  121. XWPFDocument document = new XWPFDocument(new FileInputStream(path));
  122. XWPFWordExtractor extractor = new XWPFWordExtractor(document);
  123. text = extractor.getText();
  124. // System.out.println(text);
  125. document.close();
  126. } else if (path.endsWith(".doc")) {
  127. InputStream inputStream = new FileInputStream(path);
  128. WordExtractor wordExtractor = new WordExtractor(inputStream);
  129. text = wordExtractor.getText();
  130. }
  131. if (path1.endsWith(".docx")) {
  132. XWPFDocument document1 = new XWPFDocument(new FileInputStream(path1));
  133. // List<XWPFParagraph> xwpfParagraphs= document1.getParagraphs();
  134. // xwpfParagraphs.forEach(item->{
  135. // System.out.println(item.getText());
  136. // });
  137. XWPFWordExtractor extractor1 = new XWPFWordExtractor(document1);
  138. text1 = extractor1.getText();
  139. // System.out.println(text1);
  140. // 关闭流
  141. document1.close();
  142. } else if (path1.endsWith(".doc")) {
  143. InputStream inputStream = new FileInputStream(path1);
  144. WordExtractor wordExtractor = new WordExtractor(inputStream);
  145. text1 = wordExtractor.getText();
  146. }
  147. compareText(text, text1);
  148. } catch (Exception e) {
  149. e.printStackTrace();
  150. }
  151. }
  /**
   * Diffs two texts character by character, renders the marked-up result of the second text
   * into a Word document and prints simple change statistics.
   *
   * <p>The right-hand output of {@code SectionDiffCommandVisitor} is wrapped in a {@code <p>}
   * element and parsed with Jsoup. Within each paragraph, {@code <em>} content is written bold,
   * italic and red, {@code <del>} content bold, struck through and blue, {@code <br>} becomes a
   * line break and plain text is copied as is. The document is written to a fixed path.
   *
   * <p>Changes from the previous version:
   *
   * <ul>
   *   <li>The "does the current run already hold text" check is an explicit null/empty test, not
   *       an exception swallowed by an empty {@code catch}.
   *   <li>A paragraph without any unchanged text no longer throws on division by zero: it reports
   *       100% when something changed and 0% otherwise.
   *   <li>The document is closed via try-with-resources even when rendering fails.
   * </ul>
   *
   * <p>NOTE(review): the diff output is embedded into HTML without escaping, so source text that
   * contains {@code <} may be parsed as markup — confirm whether the visitor escapes it.
   *
   * @param text the first (original) text
   * @param text1 the second (revised) text
   */
  public static void compareText(String text, String text1) {
    // commons-text character diff
    StringsComparator comparator = new StringsComparator(text, text1);
    EditScript<Character> script = comparator.getScript();
    SectionDiffCommandVisitor commandVisitor = new SectionDiffCommandVisitor();
    script.visit(commandVisitor);
    commandVisitor.finish();

    String firstDoc = commandVisitor.getLeftTemp().toString();
    System.out.println("firstDoc:" + firstDoc.length());
    String secondDoc = commandVisitor.getRightTemp().toString();
    String html = "<p>" + secondDoc.replace("\n", "<br>") + "</p>";

    // Parse the marked-up text as HTML.
    Elements paragraphs = Jsoup.parse(html).select("p");

    try (XWPFDocument wordDocument = new XWPFDocument()) {
      for (Element paragraph : paragraphs) {
        XWPFParagraph wordParagraph = wordDocument.createParagraph();
        XWPFRun run = wordParagraph.createRun();
        int emNum = 0;
        int delNum = 0;
        int total = 0;

        for (Node node : paragraph.childNodes()) {
          if (node instanceof TextNode textNode) {
            // Start a fresh run only when the current one already carries text, so plain
            // text does not inherit the styling of a preceding em/del run.
            if (StringUtils.isNotEmpty(run.getText(0))) {
              run = wordParagraph.createRun();
            }
            String plain = textNode.text();
            run.setText(plain);
            total += plain.length();
          } else if (node instanceof Element element) {
            String tag = element.tagName();
            if ("em".equalsIgnoreCase(tag)) {
              String s = element.text();
              run = wordParagraph.createRun();
              run.setText(s);
              emNum += s.length();
              run.setColor("FF0000");
              run.setBold(true);
              run.setItalic(true);
            } else if ("del".equalsIgnoreCase(tag)) {
              String s = element.text();
              run = wordParagraph.createRun();
              run.setText(s);
              delNum += s.length();
              run.setColor("0000FF");
              run.setBold(true);
              run.setStrike(true);
            } else if ("br".equalsIgnoreCase(tag)) {
              run = wordParagraph.createRun();
              run.addBreak();
            } else {
              // Recurse into any other tag.
              for (Node childNode : element.childNodes()) {
                processNode(wordParagraph, childNode);
              }
            }
          }
        }

        System.out.println("Em:" + emNum);
        System.out.println("Del:" + delNum);
        System.out.println("Total:" + total);

        int sum = emNum + delNum;
        BigDecimal diff;
        if (total == 0) {
          // No unchanged text to relate the edits to; avoid dividing by zero.
          diff = sum == 0 ? new BigDecimal("0.00") : new BigDecimal("100.00");
        } else {
          diff = new BigDecimal(sum)
              .divide(new BigDecimal(total), 2, RoundingMode.HALF_UP)
              .multiply(new BigDecimal(100));
        }
        System.out.println("Diff:" + diff + "%");
      }

      // Write the document to disk.
      try (FileOutputStream out = new FileOutputStream("F:\\file\\测试\\word对比\\CCC.docx")) {
        wordDocument.write(out);
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
  251. public void addDiff(String caseNo) throws IOException {
  252. CaseFile caseFile = caseFileMapper.selectOne(new LambdaQueryWrapper<CaseFile>()
  253. .eq(CaseFile::getCaseNo, caseNo));
  254. if (ObjectUtils.isNotEmpty(caseFile)) {
  255. List<AssoCaseFile> assoCaseFiles = assoCaseFileMapper.selectList(new LambdaQueryWrapper<AssoCaseFile>()
  256. .eq(AssoCaseFile::getCaseId, caseFile.getId()));
  257. AssoCaseFile caseFile1 = assoCaseFiles.stream().filter(i -> i.getFileType() == 1).findFirst().orElse(new AssoCaseFile());
  258. AssoCaseFile caseFile2 = assoCaseFiles.stream().filter(i -> i.getFileType() == 3).findFirst().orElse(new AssoCaseFile());
  259. AssoCaseFile caseFile3 = assoCaseFiles.stream().filter(i -> i.getFileType() == 3).findFirst().orElse(new AssoCaseFile());
  260. if (ObjectUtils.isNotEmpty(caseFile1) && ObjectUtils.isNotEmpty(caseFile2)) {
  261. Discrepancy discrepancy = new Discrepancy();
  262. discrepancy.setCaseId(caseFile.getId());
  263. discrepancy.setCaseFileId1(caseFile1.getId());
  264. discrepancy.setCaseFileId2(caseFile2.getId());
  265. discrepancy.setDiscrepancyType(1);
  266. discrepancy.insert();
  267. this.getFile(caseFile1.getFileGuid(), caseFile2.getFileGuid(), discrepancy.getId());
  268. }
  269. if (ObjectUtils.isNotEmpty(caseFile2) && ObjectUtils.isNotEmpty(caseFile3)) {
  270. Discrepancy discrepancy = new Discrepancy();
  271. discrepancy.setCaseId(caseFile.getId());
  272. discrepancy.setCaseFileId1(caseFile2.getId());
  273. discrepancy.setCaseFileId2(caseFile3.getId());
  274. discrepancy.setDiscrepancyType(2);
  275. discrepancy.insert();
  276. }
  277. }
  278. }
  279. public void getFile(String guid1, String guid2, Integer discrepancyId) throws IOException {
  280. byte[] bytes = fileManagerService.downloadSystemFileFromFMS(guid1);
  281. File tempFile = File.createTempFile("temp1_", ".docx");
  282. try (
  283. InputStream inputStream = new ByteArrayInputStream(bytes);
  284. FileOutputStream outputStream = new FileOutputStream(tempFile)
  285. ) {
  286. IOUtils.copy(inputStream, outputStream);
  287. }
  288. byte[] bytes1 = fileManagerService.downloadSystemFileFromFMS(guid2);
  289. File tempFile1 = File.createTempFile("temp2_", ".docx");
  290. try (
  291. InputStream inputStream = new ByteArrayInputStream(bytes1);
  292. FileOutputStream outputStream = new FileOutputStream(tempFile1)
  293. ) {
  294. IOUtils.copy(inputStream, outputStream);
  295. }
  296. XWPFDocument document = new XWPFDocument(new FileInputStream(tempFile));
  297. tempFile.delete();
  298. tempFile1.delete();
  299. }
  300. //最终方法
  301. @Test
  302. public void test118() {
  303. // String path = "F:\\file\\测试\\word对比\\240722-S2418519-测试卷-PACN2414802-一种用于降低GIDL的混合型Gate结构及制备-申请全文-v1F.docx";
  304. String path = "F:\\file\\测试\\word对比\\S2418519-测试卷-PACN2414802-一种用于降低GIDL的混合型Gate结构及制备-权利要求书-v1r01-sq.docx";
  305. // String path = "F:\\file\\测试\\word对比\\AAA.docx";
  306. // String path1 = "F:\\file\\测试\\word对比\\240805-S2418519-测试卷-PACN2414802-一种用于降低GIDL的混合型Gate结构及制备-申请全文-v3F-清洁版.docx";
  307. String path1 = "F:\\file\\测试\\word对比\\240722-S2418519-测试卷-PACN2414802-一种用于降低GIDL的混合型Gate结构及制备-申请全文-v1F.docx";
  308. try {
  309. // String text = "";
  310. // String text1 = "";
  311. List<String> list = new ArrayList<>();
  312. List<String> list1 = new ArrayList<>();
  313. if (path.endsWith(".docx")) {
  314. XWPFDocument document = new XWPFDocument(new FileInputStream(path));
  315. List<XWPFParagraph> paragraphs = document.getParagraphs();
  316. for (XWPFParagraph paragraph : paragraphs) {
  317. final String s = paragraph.getText().trim();
  318. if (StringUtils.isNotEmpty(s)) {
  319. list.add(s);
  320. }
  321. }
  322. // XWPFWordExtractor extractor = new XWPFWordExtractor(document);
  323. // text = extractor.getText();
  324. document.close();
  325. } else if (path.endsWith(".doc")) {
  326. InputStream inputStream = new FileInputStream(path);
  327. WordExtractor wordExtractor = new WordExtractor(inputStream);
  328. String[] paragraphText = wordExtractor.getParagraphText();
  329. for (String s : paragraphText) {
  330. String trim = s.trim();
  331. if (StringUtils.isNotEmpty(trim.trim())) {
  332. list.add(trim);
  333. }
  334. }
  335. // list = Arrays.asList(paragraphText);
  336. // text = wordExtractor.getText();
  337. }
  338. if (path1.endsWith(".docx")) {
  339. XWPFDocument document1 = new XWPFDocument(new FileInputStream(path1));
  340. List<XWPFParagraph> paragraphs = document1.getParagraphs();
  341. for (XWPFParagraph paragraph : paragraphs) {
  342. final String s = paragraph.getText().trim();
  343. if (StringUtils.isNotEmpty(s)) {
  344. list1.add(s);
  345. }
  346. }
  347. // XWPFWordExtractor extractor1 = new XWPFWordExtractor(document1);
  348. // text1 = extractor1.getText();
  349. // 关闭流
  350. document1.close();
  351. } else if (path1.endsWith(".doc")) {
  352. InputStream inputStream = new FileInputStream(path1);
  353. WordExtractor wordExtractor = new WordExtractor(inputStream);
  354. String[] paragraphText = wordExtractor.getParagraphText();
  355. list1 = Arrays.asList(paragraphText);
  356. // text1 = wordExtractor.getText();
  357. }
  358. Map<String, String> firstMap = this.getText(list);
  359. Map<String, String> secondMap = this.getText(list1);
  360. final String text = StringUtils.join(list, "\n");
  361. final String text1 = StringUtils.join(list1, "\n");
  362. System.out.println("text:" + text);
  363. System.out.println("text1:" + text1);
  364. compareText1(text, text1, 1);
  365. compareText1(firstMap.get("abstract"), secondMap.get("abstract"), 2);
  366. compareText1(firstMap.get("claims"), secondMap.get("claims"), 3);
  367. compareText1(firstMap.get("description"), secondMap.get("description"), 4);
  368. } catch (Exception e) {
  369. e.printStackTrace();
  370. }
  371. }
  372. public Map<String, String> getText(List<String> list) {
  373. List<String> abstractSection = new ArrayList<>();
  374. List<String> claimsSection = new ArrayList<>();
  375. List<String> descriptionSection = new ArrayList<>();
  376. int abstractIndex = list.indexOf("说 明 书 摘 要");
  377. int claimsIndex = list.indexOf("权 利 要 求 书");
  378. int descriptionIndex = list.indexOf("说 明 书");
  379. if (abstractIndex >= 0) {
  380. int end = (claimsIndex > 0 && claimsIndex > abstractIndex) ? claimsIndex : list.size();
  381. abstractSection = list.subList(abstractIndex, end);
  382. }
  383. if (claimsIndex >= 0) {
  384. int end = (descriptionIndex > 0 && descriptionIndex > claimsIndex) ? descriptionIndex : list.size();
  385. claimsSection = list.subList(claimsIndex, end);
  386. }
  387. if (descriptionIndex >= 0) {
  388. descriptionSection = list.subList(descriptionIndex, list.size());
  389. }
  390. String abstractStr = StringUtils.join(abstractSection, "\n");
  391. String claimsStr = StringUtils.join(claimsSection, "\n");
  392. String descriptionStr = StringUtils.join(descriptionSection, "\n");
  393. Map<String, String> map = new HashMap<>();
  394. map.put("abstract", abstractStr);
  395. map.put("claims", claimsStr);
  396. map.put("description", descriptionStr);
  397. return map;
  398. }
  399. public void compareText1(String text, String text1, Integer type) {
  400. double similarity = cosineSimilarityService.calculateCosineSimilarity(text, text1);
  401. int total = text.length();
  402. System.out.println("firstDoc:" + total);
  403. StringsComparator comparator = new StringsComparator(text, text1);
  404. EditScript<Character> script = comparator.getScript();
  405. SectionDiffCommandVisitor commandVisitor = new SectionDiffCommandVisitor();
  406. script.visit(commandVisitor);
  407. commandVisitor.finish();
  408. String secondDoc = commandVisitor.getRightTemp().toString();
  409. String secondDocReplace = secondDoc.replaceAll("(\\r\\n|\\n)", "<br>");
  410. String html = "<p>" + secondDocReplace + "</p>";
  411. // 使用Jsoup解析HTML
  412. Document doc = Jsoup.parse(html);
  413. Elements paragraphs = doc.select("p");
  414. String html3 = doc.html();
  415. // String secondDocReplace1 = html3.replaceAll("(\\r\\n|\\n)", "<br>");
  416. System.out.println(html3);
  417. // 创建Word文档
  418. XWPFDocument wordDocument = new XWPFDocument();
  419. // 遍历每个段落
  420. for (Element paragraph : paragraphs) {
  421. List<String> emList = new ArrayList<>();
  422. List<String> delList = new ArrayList<>();
  423. // 遍历段落中的每个节点
  424. for (Node node : paragraph.childNodes()) {
  425. if (node instanceof Element element) {
  426. // 处理HTML元素
  427. if ("em".equalsIgnoreCase(element.tagName())) {
  428. String s = element.text();
  429. if (StringUtils.isNotEmpty(s)) {
  430. emList.add(s);
  431. }
  432. } else if ("del".equalsIgnoreCase(element.tagName())) {
  433. String s = element.text();
  434. if (StringUtils.isNotEmpty(s)) {
  435. delList.add(s);
  436. }
  437. }
  438. }
  439. }
  440. //修改处数
  441. int emNum = 0;
  442. //修改字数
  443. int emSum = 0;
  444. //删除处数
  445. int delNum = 0;
  446. //删除字数
  447. int delSum = 0;
  448. if (!CollectionUtils.isEmpty(emList)) {
  449. emSum = emList.stream().distinct().mapToInt(String::length).sum();
  450. emNum = (int) emList.stream().distinct().count();
  451. }
  452. if (!CollectionUtils.isEmpty(delList)) {
  453. delSum = delList.stream().distinct().mapToInt(String::length).sum();
  454. delNum = (int) delList.stream().distinct().count();
  455. }
  456. System.out.println("Em:" + emNum);
  457. System.out.println("Del:" + delNum);
  458. System.out.println("Total:" + total);
  459. //修改总处数
  460. int editSum = emNum + delNum;
  461. int editWordNum = emSum + delSum;
  462. BigDecimal sumBig = new BigDecimal(editWordNum);
  463. BigDecimal diff = new BigDecimal(0);
  464. if (total != 0) {
  465. BigDecimal totalBig = new BigDecimal(total);
  466. diff = sumBig.divide(totalBig, 6, RoundingMode.HALF_UP);
  467. } else {
  468. diff = new BigDecimal(1);
  469. similarity = 0L;
  470. }
  471. System.out.println("Diff:" + diff);
  472. System.out.println("similarity:" + String.format("%.6f", similarity));
  473. System.out.println("AAAAAAAAAAAAAA");
  474. // DiscrepancyDetail detail = new DiscrepancyDetail();
  475. // detail.setDiscrepancyId(0);
  476. // detail.setTotalWorldCount(total);
  477. // detail.setEditCount(editSum);
  478. // detail.setEditWorldCount(editWordNum);
  479. // detail.setDelWorldCount(delSum);
  480. // detail.setDiffType(type);
  481. // detail.setDiffContent(secondDocReplace);
  482. // detail.setRate(diff.toString());
  483. // detail.setSimilarity(String.format("%.4f", similarity));
  484. // detail.insert();
  485. }
  486. // 关闭文档
  487. try {
  488. wordDocument.close();
  489. } catch (IOException e) {
  490. e.printStackTrace();
  491. }
  492. }
  493. @Test
  494. public void test119() {
  495. // String path = "F:\\file\\测试\\word对比\\AAA-copy.doc";
  496. String path = "F:\\file\\测试\\word对比\\AAA.docx";
  497. String path1 = "F:\\file\\测试\\word对比\\BBBB.docx";
  498. try {
  499. String text = "";
  500. String text1 = "";
  501. if (path.endsWith(".docx")) {
  502. XWPFDocument document = new XWPFDocument(new FileInputStream(path));
  503. XWPFWordExtractor extractor = new XWPFWordExtractor(document);
  504. text = extractor.getText();
  505. document.close();
  506. } else if (path.endsWith(".doc")) {
  507. InputStream inputStream = new FileInputStream(path);
  508. WordExtractor wordExtractor = new WordExtractor(inputStream);
  509. final String[] paragraphText = wordExtractor.getParagraphText();
  510. text = wordExtractor.getText();
  511. }
  512. if (path1.endsWith(".docx")) {
  513. XWPFDocument document1 = new XWPFDocument(new FileInputStream(path1));
  514. XWPFWordExtractor extractor1 = new XWPFWordExtractor(document1);
  515. text1 = extractor1.getText();
  516. // 关闭流
  517. document1.close();
  518. } else if (path1.endsWith(".doc")) {
  519. InputStream inputStream = new FileInputStream(path1);
  520. WordExtractor wordExtractor = new WordExtractor(inputStream);
  521. text1 = wordExtractor.getText();
  522. }
  523. compareText2(text, text1);
  524. } catch (Exception e) {
  525. e.printStackTrace();
  526. }
  527. }
  528. public void compareText2(String text, String text1) {
  529. double su = cosineSimilarityService.calculateCosineSimilarity(text1, text);
  530. int total = text.length();
  531. int total1 = text1.length();
  532. System.out.println("firstDoc:" + total);
  533. // commons-text
  534. StringsComparator comparator = new StringsComparator(text, text1);
  535. EditScript<Character> script = comparator.getScript();
  536. SectionDiffCommandVisitor commandVisitor = new SectionDiffCommandVisitor();
  537. script.visit(commandVisitor);
  538. commandVisitor.finish();
  539. // System.out.println(commandVisitor.getLeftTemp());
  540. // System.out.println(commandVisitor.getRightTemp());
  541. String secondDoc = commandVisitor.getRightTemp().toString();
  542. String html = "<p>" + secondDoc + "</p>";
  543. // System.out.println(html);
  544. // 使用Jsoup解析HTML
  545. Document doc = Jsoup.parse(html);
  546. Elements paragraphs = doc.select("p");
  547. String html3 = doc.html();
  548. String secondDocReplace = html3.replace("(\r\n|\n)", "<br>");
  549. System.out.println(secondDocReplace);
  550. // 创建Word文档
  551. XWPFDocument wordDocument = new XWPFDocument();
  552. // 遍历每个段落
  553. for (Element paragraph : paragraphs) {
  554. List<String> emList = new ArrayList<>();
  555. List<String> delList = new ArrayList<>();
  556. // 遍历段落中的每个节点
  557. for (Node node : paragraph.childNodes()) {
  558. if (node instanceof Element element) {
  559. // 处理HTML元素
  560. if ("em".equalsIgnoreCase(element.tagName())) {
  561. String s = element.text();
  562. emList.add(s);
  563. } else if ("del".equalsIgnoreCase(element.tagName())) {
  564. String s = element.text();
  565. delList.add(s);
  566. }
  567. }
  568. }
  569. //修改处数
  570. int emNum = 0;
  571. //修改字数
  572. int emSum = 0;
  573. //删除处数
  574. int delNum = 0;
  575. //删除字数
  576. int delSum = 0;
  577. if (!CollectionUtils.isEmpty(emList)) {
  578. emSum = emList.stream().mapToInt(String::length).sum();
  579. emNum = (int) emList.stream().distinct().count();
  580. }
  581. if (!CollectionUtils.isEmpty(delList)) {
  582. delSum = delList.stream().mapToInt(String::length).sum();
  583. delNum = (int) delList.stream().distinct().count();
  584. }
  585. System.out.println("Em:" + emNum);
  586. System.out.println("Del:" + delNum);
  587. System.out.println("Total:" + total);
  588. //修改总处数
  589. int editSum = emNum + delNum;
  590. int editWordNum = emSum + delSum;
  591. BigDecimal sumBig = new BigDecimal(editWordNum);
  592. BigDecimal totalBig = new BigDecimal(total);
  593. BigDecimal diff = sumBig.divide(totalBig, 4, RoundingMode.HALF_UP);
  594. System.out.println("Diff:" + diff);
  595. System.out.println("similar:" + String.format("%.4f", su));
  596. }
  597. // 关闭文档
  598. try {
  599. wordDocument.close();
  600. } catch (IOException e) {
  601. e.printStackTrace();
  602. }
  603. }
  604. @Test
  605. public void test120() throws IOException {
  606. // String path = "F:\\file\\测试\\word对比\\AAA-copy.doc";
  607. // String path = "F:\\file\\测试\\word对比\\AAA.docx";
  608. // String path1 = "F:\\file\\测试\\word对比\\BBBB.docx";
  609. String path = "F:\\file\\测试\\word对比\\S2418519-测试卷-PACN2414802-一种用于降低GIDL的混合型Gate结构及制备-权利要求书-v1r01-sq.docx";
  610. String path1 = "F:\\file\\测试\\word对比\\240722-S2418519-测试卷-PACN2414802-一种用于降低GIDL的混合型Gate结构及制备-申请全文-v1F.docx";
  611. List<String> list = new ArrayList<>();
  612. List<String> list1 = new ArrayList<>();
  613. try {
  614. String text = "";
  615. String text1 = "";
  616. if (path.endsWith(".docx")) {
  617. XWPFDocument document = new XWPFDocument(new FileInputStream(path));
  618. List<XWPFParagraph> paragraphs = document.getParagraphs();
  619. for (XWPFParagraph paragraph : paragraphs) {
  620. final String s = paragraph.getText().trim();
  621. if (StringUtils.isNotEmpty(s)) {
  622. list.add(s);
  623. }
  624. }
  625. XWPFWordExtractor extractor = new XWPFWordExtractor(document);
  626. text = extractor.getText();
  627. document.close();
  628. } else if (path.endsWith(".doc")) {
  629. InputStream inputStream = new FileInputStream(path);
  630. WordExtractor wordExtractor = new WordExtractor(inputStream);
  631. String[] paragraphText = wordExtractor.getParagraphText();
  632. for (String s : paragraphText) {
  633. if (StringUtils.isNotEmpty(s)) {
  634. list.add(s);
  635. }
  636. }
  637. list = Arrays.asList(paragraphText);
  638. text = wordExtractor.getText();
  639. }
  640. if (path1.endsWith(".docx")) {
  641. XWPFDocument document1 = new XWPFDocument(new FileInputStream(path1));
  642. List<XWPFParagraph> paragraphs = document1.getParagraphs();
  643. for (XWPFParagraph paragraph : paragraphs) {
  644. String s = paragraph.getText().trim();
  645. if (StringUtils.isNotEmpty(s)) {
  646. list1.add(s);
  647. }
  648. }
  649. XWPFWordExtractor extractor1 = new XWPFWordExtractor(document1);
  650. text1 = extractor1.getText();
  651. // 关闭流
  652. document1.close();
  653. } else if (path1.endsWith(".doc")) {
  654. InputStream inputStream = new FileInputStream(path1);
  655. WordExtractor wordExtractor = new WordExtractor(inputStream);
  656. text1 = wordExtractor.getText();
  657. }
  658. // final double cscwv = cosineSimilarityService.calculateAverageCSCWV(list, list1);
  659. // final double su = cosineSimilarityService.calculateCosineSimilarity(text1, text);
  660. // final double su1 = cosineSimilarityService.calculateCosineSimilarity(text, text);
  661. // System.out.println(su);
  662. Map<String, String> firstMap = this.getText(list);
  663. Map<String, String> secondMap = this.getText(list1);
  664. System.out.println(firstMap);
  665. } catch (Exception e) {
  666. e.printStackTrace();
  667. }
  668. }
  669. }