package cn.cslg.wdc; import cn.cslg.wdc.common.exception.XiaoShiException; import cn.cslg.wdc.dto.SelectCaseInfoDTO; import cn.cslg.wdc.dto.common.SectionDiffCommandVisitor; import cn.cslg.wdc.entity.AssoCaseFile; import cn.cslg.wdc.entity.CaseFile; import cn.cslg.wdc.entity.Discrepancy; import cn.cslg.wdc.mapper.AssoCaseFileMapper; import cn.cslg.wdc.mapper.CaseFileMapper; import cn.cslg.wdc.service.CaseFileService; import cn.cslg.wdc.service.DiscrepancyService; import cn.cslg.wdc.service.common.CosineSimilarityService; import cn.cslg.wdc.service.common.FileManagerService; import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper; import org.apache.commons.compress.utils.IOUtils; import org.apache.commons.lang3.ObjectUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.text.diff.EditScript; import org.apache.commons.text.diff.StringsComparator; import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.xwpf.extractor.XWPFWordExtractor; import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.apache.poi.xwpf.usermodel.XWPFParagraph; import org.apache.poi.xwpf.usermodel.XWPFRun; import org.checkerframework.checker.units.qual.A; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; import org.jsoup.select.Elements; import org.junit.jupiter.api.Test; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.test.context.SpringBootTest; import org.springframework.util.CollectionUtils; import java.io.*; import java.math.BigDecimal; import java.math.RoundingMode; import java.util.*; @SpringBootTest class WdcApplicationTests { @Autowired private AssoCaseFileMapper assoCaseFileMapper; @Autowired private CaseFileMapper caseFileMapper; @Autowired private DiscrepancyService discrepancyService; @Autowired private CosineSimilarityService cosineSimilarityService; @Autowired private FileManagerService fileManagerService; @Autowired private CaseFileService caseFileService; @Test void contextLoads() throws IOException { String s1 = "46283d7fdae3413491da50dfd3b92364"; String s2 = "f8d77ae1ce2f4d298064ecaecf5ff301"; Integer s = 8; String s3 = "241023-P20241102-PACN2417839-翻译方法、翻译装置、电子设备以及计算机可读存储介质-专利申请文件-v1F.docx"; String s4 = "241024-S2435631-测试卷-压缩包-语音翻译方法及装置、电子设备以及计算机可读存储介质-新申请文档-定稿.docx"; caseFileService.getDiscrepancyByFile(s1, s2, s, s3, s4); System.out.println("AAAAAA"); // final Long count = discrepancyService.getBaseMapper().selectCount(new LambdaQueryWrapper()); // System.out.println(count); // Discrepancy discrepancy = new Discrepancy(); // discrepancy.setCaseNo("sajdsak"); // discrepancy.setFirstDiscrepancy("0.36"); // discrepancy.setSecondDiscrepancy("0.66"); // discrepancy.insert(); } private static void processNode(XWPFParagraph paragraph, Node node) { XWPFRun run = paragraph.createRun(); if (node instanceof TextNode) { run.setText(((TextNode) node).text()); } else if (node instanceof Element) { Element element = (Element) node; String tagName = element.tagName(); if ("em".equalsIgnoreCase(tagName)) { run.setText(element.text()); run.setItalic(true); } else if ("del".equalsIgnoreCase(tagName)) { run.setText(element.text()); run.setStrike(true); } else { // 递归处理其他标签(如果有) for (Node childNode : element.childNodes()) { processNode(paragraph, childNode); } } } } @Test public void test1() throws Exception { String path = "F:\\file\\测试\\word对比\\CCC.docx"; XWPFDocument document = new XWPFDocument(new FileInputStream(path)); XWPFWordExtractor extractor = new XWPFWordExtractor(document); String text = extractor.getText(); System.out.println(text); document.close(); Document doc = Jsoup.parse(text); Elements paragraphs = doc.select("p"); } @Test public void test117() { // String path = "F:\\file\\测试\\word对比\\S2418519-测试卷-PACN2414802-一种用于降低GIDL的混合型Gate结构及制备-权利要求书-v1r01-sq.docx"; // String path1 = "F:\\file\\测试\\word对比\\240722-S2418519-测试卷-PACN2414802-一种用于降低GIDL的混合型Gate结构及制备-申请全文-v1F.docx"; // String path = "F:\\file\\测试\\word对比\\240722-S2418519-测试卷-PACN2414802-一种用于降低GIDL的混合型Gate结构及制备-申请全文-v1F.docx"; // String path1 = "F:\\file\\测试\\word对比\\240805-S2418519-测试卷-PACN2414802-一种用于降低GIDL的混合型Gate结构及制备-申请全文-v3F-清洁版.docx"; String path = "F:\\file\\测试\\word对比\\AAA.docx"; String path1 = "F:\\file\\测试\\word对比\\BBBB.docx"; if (!path.endsWith(".doc") && !path.endsWith(".docx")) { throw new XiaoShiException("请上传Word文件"); } if (!path1.endsWith(".doc") && !path1.endsWith(".docx")) { throw new XiaoShiException("请上传Word文件"); } try { String text = ""; String text1 = ""; if (path.endsWith(".docx")) { XWPFDocument document = new XWPFDocument(new FileInputStream(path)); XWPFWordExtractor extractor = new XWPFWordExtractor(document); text = extractor.getText(); // System.out.println(text); document.close(); } else if (path.endsWith(".doc")) { InputStream inputStream = new FileInputStream(path); WordExtractor wordExtractor = new WordExtractor(inputStream); text = wordExtractor.getText(); } if (path1.endsWith(".docx")) { XWPFDocument document1 = new XWPFDocument(new FileInputStream(path1)); // List xwpfParagraphs= document1.getParagraphs(); // xwpfParagraphs.forEach(item->{ // System.out.println(item.getText()); // }); XWPFWordExtractor extractor1 = new XWPFWordExtractor(document1); text1 = extractor1.getText(); // System.out.println(text1); // 关闭流 document1.close(); } else if (path1.endsWith(".doc")) { InputStream inputStream = new FileInputStream(path1); WordExtractor wordExtractor = new WordExtractor(inputStream); text1 = wordExtractor.getText(); } compareText(text, text1); } catch (Exception e) { e.printStackTrace(); } } public static void compareText(String text, String text1) { // commons-text StringsComparator comparator = new StringsComparator(text, text1); EditScript script = comparator.getScript(); SectionDiffCommandVisitor commandVisitor = new SectionDiffCommandVisitor(); script.visit(commandVisitor); commandVisitor.finish(); // System.out.println(commandVisitor.getLeftTemp()); // System.out.println(commandVisitor.getRightTemp()); String firstDoc = commandVisitor.getLeftTemp().toString(); int firstDocLen = firstDoc.length(); System.out.println("firstDoc:" + firstDocLen); String secondDoc = commandVisitor.getRightTemp().toString(); String secondDocReplace = secondDoc.replace("\n", "
"); String html = "

" + secondDocReplace + "

"; // System.out.println(html); // 使用Jsoup解析HTML Document doc = Jsoup.parse(html); Elements paragraphs = doc.select("p"); // 创建Word文档 XWPFDocument wordDocument = new XWPFDocument(); // 遍历每个段落 for (Element paragraph : paragraphs) { XWPFParagraph wordParagraph = wordDocument.createParagraph(); XWPFRun run = wordParagraph.createRun(); int emNum = 0; int delNum = 0; int total = 0; // 遍历段落中的每个节点 for (Node node : paragraph.childNodes()) { if (node instanceof TextNode) { try { boolean b = run.getText(0).isEmpty(); // System.out.println(run.getText(0)); if (!b) { if (org.apache.commons.lang3.StringUtils.isNotEmpty(run.getText(0))) { run = wordParagraph.createRun(); } } } catch (Exception e) { } // 处理纯文本节点 run.setText(((TextNode) node).text()); total += ((TextNode) node).text().length(); } else if (node instanceof Element) { // 处理HTML元素 Element element = (Element) node; if ("em".equalsIgnoreCase(element.tagName())) { String s = element.text(); // 应用斜体样式 run = wordParagraph.createRun(); run.setText(s); emNum += s.length(); run.setColor("FF0000"); run.setBold(true); run.setItalic(true); } else if ("del".equalsIgnoreCase(element.tagName())) { String s = element.text(); // 应用删除线样式 run = wordParagraph.createRun(); run.setText(s); delNum += s.length(); run.setColor("0000FF"); run.setBold(true); run.setStrike(true); } else if ("br".equalsIgnoreCase(element.tagName())) { run = wordParagraph.createRun(); run.addBreak(); } else { // 递归处理其他标签(如果有) for (Node childNode : element.childNodes()) { processNode(wordParagraph, childNode); } } } } System.out.println("Em:" + emNum); System.out.println("Del:" + delNum); System.out.println("Total:" + total); int sum = emNum + delNum; final BigDecimal sumBig = new BigDecimal(sum); final BigDecimal totalBig = new BigDecimal(total); BigDecimal diff = sumBig.divide(totalBig, 2, RoundingMode.HALF_UP) .multiply(new BigDecimal(100)); System.out.println("Diff:" + diff + "%"); } // 将文档写入文件 try (FileOutputStream out = new FileOutputStream("F:\\file\\测试\\word对比\\CCC.docx")) { wordDocument.write(out); } catch (IOException e) { e.printStackTrace(); } // 关闭文档 try { wordDocument.close(); } catch (IOException e) { e.printStackTrace(); } } public void addDiff(String caseNo) throws IOException { CaseFile caseFile = caseFileMapper.selectOne(new LambdaQueryWrapper() .eq(CaseFile::getCaseNo, caseNo)); if (ObjectUtils.isNotEmpty(caseFile)) { List assoCaseFiles = assoCaseFileMapper.selectList(new LambdaQueryWrapper() .eq(AssoCaseFile::getCaseId, caseFile.getId())); AssoCaseFile caseFile1 = assoCaseFiles.stream().filter(i -> i.getFileType() == 1).findFirst().orElse(new AssoCaseFile()); AssoCaseFile caseFile2 = assoCaseFiles.stream().filter(i -> i.getFileType() == 3).findFirst().orElse(new AssoCaseFile()); AssoCaseFile caseFile3 = assoCaseFiles.stream().filter(i -> i.getFileType() == 3).findFirst().orElse(new AssoCaseFile()); if (ObjectUtils.isNotEmpty(caseFile1) && ObjectUtils.isNotEmpty(caseFile2)) { Discrepancy discrepancy = new Discrepancy(); discrepancy.setCaseId(caseFile.getId()); discrepancy.setCaseFileId1(caseFile1.getId()); discrepancy.setCaseFileId2(caseFile2.getId()); discrepancy.setDiscrepancyType(1); discrepancy.insert(); this.getFile(caseFile1.getFileGuid(), caseFile2.getFileGuid(), discrepancy.getId()); } if (ObjectUtils.isNotEmpty(caseFile2) && ObjectUtils.isNotEmpty(caseFile3)) { Discrepancy discrepancy = new Discrepancy(); discrepancy.setCaseId(caseFile.getId()); discrepancy.setCaseFileId1(caseFile2.getId()); discrepancy.setCaseFileId2(caseFile3.getId()); discrepancy.setDiscrepancyType(2); discrepancy.insert(); } } } public void getFile(String guid1, String guid2, Integer discrepancyId) throws IOException { byte[] bytes = fileManagerService.downloadSystemFileFromFMS(guid1); File tempFile = File.createTempFile("temp1_", ".docx"); try ( InputStream inputStream = new ByteArrayInputStream(bytes); FileOutputStream outputStream = new FileOutputStream(tempFile) ) { IOUtils.copy(inputStream, outputStream); } byte[] bytes1 = fileManagerService.downloadSystemFileFromFMS(guid2); File tempFile1 = File.createTempFile("temp2_", ".docx"); try ( InputStream inputStream = new ByteArrayInputStream(bytes1); FileOutputStream outputStream = new FileOutputStream(tempFile1) ) { IOUtils.copy(inputStream, outputStream); } XWPFDocument document = new XWPFDocument(new FileInputStream(tempFile)); tempFile.delete(); tempFile1.delete(); } //最终方法 @Test public void test118() { // String path = "F:\\file\\测试\\word对比\\240722-S2418519-测试卷-PACN2414802-一种用于降低GIDL的混合型Gate结构及制备-申请全文-v1F.docx"; String path = "F:\\file\\测试\\word对比\\S2418519-测试卷-PACN2414802-一种用于降低GIDL的混合型Gate结构及制备-权利要求书-v1r01-sq.docx"; // String path = "F:\\file\\测试\\word对比\\AAA.docx"; // String path1 = "F:\\file\\测试\\word对比\\240805-S2418519-测试卷-PACN2414802-一种用于降低GIDL的混合型Gate结构及制备-申请全文-v3F-清洁版.docx"; String path1 = "F:\\file\\测试\\word对比\\240722-S2418519-测试卷-PACN2414802-一种用于降低GIDL的混合型Gate结构及制备-申请全文-v1F.docx"; try { // String text = ""; // String text1 = ""; List list = new ArrayList<>(); List list1 = new ArrayList<>(); if (path.endsWith(".docx")) { XWPFDocument document = new XWPFDocument(new FileInputStream(path)); List paragraphs = document.getParagraphs(); for (XWPFParagraph paragraph : paragraphs) { final String s = paragraph.getText().trim(); if (StringUtils.isNotEmpty(s)) { list.add(s); } } // XWPFWordExtractor extractor = new XWPFWordExtractor(document); // text = extractor.getText(); document.close(); } else if (path.endsWith(".doc")) { InputStream inputStream = new FileInputStream(path); WordExtractor wordExtractor = new WordExtractor(inputStream); String[] paragraphText = wordExtractor.getParagraphText(); for (String s : paragraphText) { String trim = s.trim(); if (StringUtils.isNotEmpty(trim.trim())) { list.add(trim); } } // list = Arrays.asList(paragraphText); // text = wordExtractor.getText(); } if (path1.endsWith(".docx")) { XWPFDocument document1 = new XWPFDocument(new FileInputStream(path1)); List paragraphs = document1.getParagraphs(); for (XWPFParagraph paragraph : paragraphs) { final String s = paragraph.getText().trim(); if (StringUtils.isNotEmpty(s)) { list1.add(s); } } // XWPFWordExtractor extractor1 = new XWPFWordExtractor(document1); // text1 = extractor1.getText(); // 关闭流 document1.close(); } else if (path1.endsWith(".doc")) { InputStream inputStream = new FileInputStream(path1); WordExtractor wordExtractor = new WordExtractor(inputStream); String[] paragraphText = wordExtractor.getParagraphText(); list1 = Arrays.asList(paragraphText); // text1 = wordExtractor.getText(); } Map firstMap = this.getText(list); Map secondMap = this.getText(list1); final String text = StringUtils.join(list, "\n"); final String text1 = StringUtils.join(list1, "\n"); System.out.println("text:" + text); System.out.println("text1:" + text1); compareText1(text, text1, 1); compareText1(firstMap.get("abstract"), secondMap.get("abstract"), 2); compareText1(firstMap.get("claims"), secondMap.get("claims"), 3); compareText1(firstMap.get("description"), secondMap.get("description"), 4); } catch (Exception e) { e.printStackTrace(); } } public Map getText(List list) { List abstractSection = new ArrayList<>(); List claimsSection = new ArrayList<>(); List descriptionSection = new ArrayList<>(); int abstractIndex = list.indexOf("说 明 书 摘 要"); int claimsIndex = list.indexOf("权 利 要 求 书"); int descriptionIndex = list.indexOf("说 明 书"); if (abstractIndex >= 0) { int end = (claimsIndex > 0 && claimsIndex > abstractIndex) ? claimsIndex : list.size(); abstractSection = list.subList(abstractIndex, end); } if (claimsIndex >= 0) { int end = (descriptionIndex > 0 && descriptionIndex > claimsIndex) ? descriptionIndex : list.size(); claimsSection = list.subList(claimsIndex, end); } if (descriptionIndex >= 0) { descriptionSection = list.subList(descriptionIndex, list.size()); } String abstractStr = StringUtils.join(abstractSection, "\n"); String claimsStr = StringUtils.join(claimsSection, "\n"); String descriptionStr = StringUtils.join(descriptionSection, "\n"); Map map = new HashMap<>(); map.put("abstract", abstractStr); map.put("claims", claimsStr); map.put("description", descriptionStr); return map; } public void compareText1(String text, String text1, Integer type) { double similarity = cosineSimilarityService.calculateCosineSimilarity(text, text1); int total = text.length(); System.out.println("firstDoc:" + total); StringsComparator comparator = new StringsComparator(text, text1); EditScript script = comparator.getScript(); SectionDiffCommandVisitor commandVisitor = new SectionDiffCommandVisitor(); script.visit(commandVisitor); commandVisitor.finish(); String secondDoc = commandVisitor.getRightTemp().toString(); String secondDocReplace = secondDoc.replaceAll("(\\r\\n|\\n)", "
"); String html = "

" + secondDocReplace + "

"; // 使用Jsoup解析HTML Document doc = Jsoup.parse(html); Elements paragraphs = doc.select("p"); String html3 = doc.html(); // String secondDocReplace1 = html3.replaceAll("(\\r\\n|\\n)", "
"); System.out.println(html3); // 创建Word文档 XWPFDocument wordDocument = new XWPFDocument(); // 遍历每个段落 for (Element paragraph : paragraphs) { List emList = new ArrayList<>(); List delList = new ArrayList<>(); // 遍历段落中的每个节点 for (Node node : paragraph.childNodes()) { if (node instanceof Element element) { // 处理HTML元素 if ("em".equalsIgnoreCase(element.tagName())) { String s = element.text(); if (StringUtils.isNotEmpty(s)) { emList.add(s); } } else if ("del".equalsIgnoreCase(element.tagName())) { String s = element.text(); if (StringUtils.isNotEmpty(s)) { delList.add(s); } } } } //修改处数 int emNum = 0; //修改字数 int emSum = 0; //删除处数 int delNum = 0; //删除字数 int delSum = 0; if (!CollectionUtils.isEmpty(emList)) { emSum = emList.stream().distinct().mapToInt(String::length).sum(); emNum = (int) emList.stream().distinct().count(); } if (!CollectionUtils.isEmpty(delList)) { delSum = delList.stream().distinct().mapToInt(String::length).sum(); delNum = (int) delList.stream().distinct().count(); } System.out.println("Em:" + emNum); System.out.println("Del:" + delNum); System.out.println("Total:" + total); //修改总处数 int editSum = emNum + delNum; int editWordNum = emSum + delSum; BigDecimal sumBig = new BigDecimal(editWordNum); BigDecimal diff = new BigDecimal(0); if (total != 0) { BigDecimal totalBig = new BigDecimal(total); diff = sumBig.divide(totalBig, 6, RoundingMode.HALF_UP); } else { diff = new BigDecimal(1); similarity = 0L; } System.out.println("Diff:" + diff); System.out.println("similarity:" + String.format("%.6f", similarity)); System.out.println("AAAAAAAAAAAAAA"); // DiscrepancyDetail detail = new DiscrepancyDetail(); // detail.setDiscrepancyId(0); // detail.setTotalWorldCount(total); // detail.setEditCount(editSum); // detail.setEditWorldCount(editWordNum); // detail.setDelWorldCount(delSum); // detail.setDiffType(type); // detail.setDiffContent(secondDocReplace); // detail.setRate(diff.toString()); // detail.setSimilarity(String.format("%.4f", similarity)); // detail.insert(); } // 关闭文档 try { wordDocument.close(); } catch (IOException e) { e.printStackTrace(); } } @Test public void test119() { // String path = "F:\\file\\测试\\word对比\\AAA-copy.doc"; String path = "F:\\file\\测试\\word对比\\AAA.docx"; String path1 = "F:\\file\\测试\\word对比\\BBBB.docx"; try { String text = ""; String text1 = ""; if (path.endsWith(".docx")) { XWPFDocument document = new XWPFDocument(new FileInputStream(path)); XWPFWordExtractor extractor = new XWPFWordExtractor(document); text = extractor.getText(); document.close(); } else if (path.endsWith(".doc")) { InputStream inputStream = new FileInputStream(path); WordExtractor wordExtractor = new WordExtractor(inputStream); final String[] paragraphText = wordExtractor.getParagraphText(); text = wordExtractor.getText(); } if (path1.endsWith(".docx")) { XWPFDocument document1 = new XWPFDocument(new FileInputStream(path1)); XWPFWordExtractor extractor1 = new XWPFWordExtractor(document1); text1 = extractor1.getText(); // 关闭流 document1.close(); } else if (path1.endsWith(".doc")) { InputStream inputStream = new FileInputStream(path1); WordExtractor wordExtractor = new WordExtractor(inputStream); text1 = wordExtractor.getText(); } compareText2(text, text1); } catch (Exception e) { e.printStackTrace(); } } public void compareText2(String text, String text1) { double su = cosineSimilarityService.calculateCosineSimilarity(text1, text); int total = text.length(); int total1 = text1.length(); System.out.println("firstDoc:" + total); // commons-text StringsComparator comparator = new StringsComparator(text, text1); EditScript script = comparator.getScript(); SectionDiffCommandVisitor commandVisitor = new SectionDiffCommandVisitor(); script.visit(commandVisitor); commandVisitor.finish(); // System.out.println(commandVisitor.getLeftTemp()); // System.out.println(commandVisitor.getRightTemp()); String secondDoc = commandVisitor.getRightTemp().toString(); String html = "

" + secondDoc + "

"; // System.out.println(html); // 使用Jsoup解析HTML Document doc = Jsoup.parse(html); Elements paragraphs = doc.select("p"); String html3 = doc.html(); String secondDocReplace = html3.replace("(\r\n|\n)", "
"); System.out.println(secondDocReplace); // 创建Word文档 XWPFDocument wordDocument = new XWPFDocument(); // 遍历每个段落 for (Element paragraph : paragraphs) { List emList = new ArrayList<>(); List delList = new ArrayList<>(); // 遍历段落中的每个节点 for (Node node : paragraph.childNodes()) { if (node instanceof Element element) { // 处理HTML元素 if ("em".equalsIgnoreCase(element.tagName())) { String s = element.text(); emList.add(s); } else if ("del".equalsIgnoreCase(element.tagName())) { String s = element.text(); delList.add(s); } } } //修改处数 int emNum = 0; //修改字数 int emSum = 0; //删除处数 int delNum = 0; //删除字数 int delSum = 0; if (!CollectionUtils.isEmpty(emList)) { emSum = emList.stream().mapToInt(String::length).sum(); emNum = (int) emList.stream().distinct().count(); } if (!CollectionUtils.isEmpty(delList)) { delSum = delList.stream().mapToInt(String::length).sum(); delNum = (int) delList.stream().distinct().count(); } System.out.println("Em:" + emNum); System.out.println("Del:" + delNum); System.out.println("Total:" + total); //修改总处数 int editSum = emNum + delNum; int editWordNum = emSum + delSum; BigDecimal sumBig = new BigDecimal(editWordNum); BigDecimal totalBig = new BigDecimal(total); BigDecimal diff = sumBig.divide(totalBig, 4, RoundingMode.HALF_UP); System.out.println("Diff:" + diff); System.out.println("similar:" + String.format("%.4f", su)); } // 关闭文档 try { wordDocument.close(); } catch (IOException e) { e.printStackTrace(); } } @Test public void test120() throws IOException { // String path = "F:\\file\\测试\\word对比\\AAA-copy.doc"; // String path = "F:\\file\\测试\\word对比\\AAA.docx"; // String path1 = "F:\\file\\测试\\word对比\\BBBB.docx"; String path = "F:\\file\\测试\\word对比\\S2418519-测试卷-PACN2414802-一种用于降低GIDL的混合型Gate结构及制备-权利要求书-v1r01-sq.docx"; String path1 = "F:\\file\\测试\\word对比\\240722-S2418519-测试卷-PACN2414802-一种用于降低GIDL的混合型Gate结构及制备-申请全文-v1F.docx"; List list = new ArrayList<>(); List list1 = new ArrayList<>(); try { String text = ""; String text1 = ""; if (path.endsWith(".docx")) { XWPFDocument document = new XWPFDocument(new FileInputStream(path)); List paragraphs = document.getParagraphs(); for (XWPFParagraph paragraph : paragraphs) { final String s = paragraph.getText().trim(); if (StringUtils.isNotEmpty(s)) { list.add(s); } } XWPFWordExtractor extractor = new XWPFWordExtractor(document); text = extractor.getText(); document.close(); } else if (path.endsWith(".doc")) { InputStream inputStream = new FileInputStream(path); WordExtractor wordExtractor = new WordExtractor(inputStream); String[] paragraphText = wordExtractor.getParagraphText(); for (String s : paragraphText) { if (StringUtils.isNotEmpty(s)) { list.add(s); } } list = Arrays.asList(paragraphText); text = wordExtractor.getText(); } if (path1.endsWith(".docx")) { XWPFDocument document1 = new XWPFDocument(new FileInputStream(path1)); List paragraphs = document1.getParagraphs(); for (XWPFParagraph paragraph : paragraphs) { String s = paragraph.getText().trim(); if (StringUtils.isNotEmpty(s)) { list1.add(s); } } XWPFWordExtractor extractor1 = new XWPFWordExtractor(document1); text1 = extractor1.getText(); // 关闭流 document1.close(); } else if (path1.endsWith(".doc")) { InputStream inputStream = new FileInputStream(path1); WordExtractor wordExtractor = new WordExtractor(inputStream); text1 = wordExtractor.getText(); } // final double cscwv = cosineSimilarityService.calculateAverageCSCWV(list, list1); // final double su = cosineSimilarityService.calculateCosineSimilarity(text1, text); // final double su1 = cosineSimilarityService.calculateCosineSimilarity(text, text); // System.out.println(su); Map firstMap = this.getText(list); Map secondMap = this.getText(list1); System.out.println(firstMap); } catch (Exception e) { e.printStackTrace(); } } }