123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714 |
- package cn.cslg.wdc;
- import cn.cslg.wdc.common.exception.XiaoShiException;
- import cn.cslg.wdc.dto.SelectCaseInfoDTO;
- import cn.cslg.wdc.dto.common.SectionDiffCommandVisitor;
- import cn.cslg.wdc.entity.AssoCaseFile;
- import cn.cslg.wdc.entity.CaseFile;
- import cn.cslg.wdc.entity.Discrepancy;
- import cn.cslg.wdc.mapper.AssoCaseFileMapper;
- import cn.cslg.wdc.mapper.CaseFileMapper;
- import cn.cslg.wdc.service.CaseFileService;
- import cn.cslg.wdc.service.DiscrepancyService;
- import cn.cslg.wdc.service.common.CosineSimilarityService;
- import cn.cslg.wdc.service.common.FileManagerService;
- import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
- import org.apache.commons.compress.utils.IOUtils;
- import org.apache.commons.lang3.ObjectUtils;
- import org.apache.commons.lang3.StringUtils;
- import org.apache.commons.text.diff.EditScript;
- import org.apache.commons.text.diff.StringsComparator;
- import org.apache.poi.hwpf.extractor.WordExtractor;
- import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
- import org.apache.poi.xwpf.usermodel.XWPFDocument;
- import org.apache.poi.xwpf.usermodel.XWPFParagraph;
- import org.apache.poi.xwpf.usermodel.XWPFRun;
- import org.checkerframework.checker.units.qual.A;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.nodes.Node;
- import org.jsoup.nodes.TextNode;
- import org.jsoup.select.Elements;
- import org.junit.jupiter.api.Test;
- import org.springframework.beans.factory.annotation.Autowired;
- import org.springframework.boot.test.context.SpringBootTest;
- import org.springframework.util.CollectionUtils;
- import java.io.*;
- import java.math.BigDecimal;
- import java.math.RoundingMode;
- import java.util.*;
- @SpringBootTest
- class WdcApplicationTests {
- @Autowired
- private AssoCaseFileMapper assoCaseFileMapper;
- @Autowired
- private CaseFileMapper caseFileMapper;
- @Autowired
- private DiscrepancyService discrepancyService;
- @Autowired
- private CosineSimilarityService cosineSimilarityService;
- @Autowired
- private FileManagerService fileManagerService;
- @Autowired
- private CaseFileService caseFileService;
- @Test
- void contextLoads() throws IOException {
- String s1 = "46283d7fdae3413491da50dfd3b92364";
- String s2 = "f8d77ae1ce2f4d298064ecaecf5ff301";
- Integer s = 8;
- String s3 = "241023-P20241102-PACN2417839-翻译方法、翻译装置、电子设备以及计算机可读存储介质-专利申请文件-v1F.docx";
- String s4 = "241024-S2435631-测试卷-压缩包-语音翻译方法及装置、电子设备以及计算机可读存储介质-新申请文档-定稿.docx";
- caseFileService.getDiscrepancyByFile(s1, s2, s, s3, s4);
- System.out.println("AAAAAA");
- // final Long count = discrepancyService.getBaseMapper().selectCount(new LambdaQueryWrapper<Discrepancy>());
- // System.out.println(count);
- // Discrepancy discrepancy = new Discrepancy();
- // discrepancy.setCaseNo("sajdsak");
- // discrepancy.setFirstDiscrepancy("0.36");
- // discrepancy.setSecondDiscrepancy("0.66");
- // discrepancy.insert();
- }
- private static void processNode(XWPFParagraph paragraph, Node node) {
- XWPFRun run = paragraph.createRun();
- if (node instanceof TextNode) {
- run.setText(((TextNode) node).text());
- } else if (node instanceof Element) {
- Element element = (Element) node;
- String tagName = element.tagName();
- if ("em".equalsIgnoreCase(tagName)) {
- run.setText(element.text());
- run.setItalic(true);
- } else if ("del".equalsIgnoreCase(tagName)) {
- run.setText(element.text());
- run.setStrike(true);
- } else {
- // 递归处理其他标签(如果有)
- for (Node childNode : element.childNodes()) {
- processNode(paragraph, childNode);
- }
- }
- }
- }
- @Test
- public void test1() throws Exception {
- String path = "F:\\file\\测试\\word对比\\CCC.docx";
- XWPFDocument document = new XWPFDocument(new FileInputStream(path));
- XWPFWordExtractor extractor = new XWPFWordExtractor(document);
- String text = extractor.getText();
- System.out.println(text);
- document.close();
- Document doc = Jsoup.parse(text);
- Elements paragraphs = doc.select("p");
- }
- @Test
- public void test117() {
- // String path = "F:\\file\\测试\\word对比\\S2418519-测试卷-PACN2414802-一种用于降低GIDL的混合型Gate结构及制备-权利要求书-v1r01-sq.docx";
- // String path1 = "F:\\file\\测试\\word对比\\240722-S2418519-测试卷-PACN2414802-一种用于降低GIDL的混合型Gate结构及制备-申请全文-v1F.docx";
- // String path = "F:\\file\\测试\\word对比\\240722-S2418519-测试卷-PACN2414802-一种用于降低GIDL的混合型Gate结构及制备-申请全文-v1F.docx";
- // String path1 = "F:\\file\\测试\\word对比\\240805-S2418519-测试卷-PACN2414802-一种用于降低GIDL的混合型Gate结构及制备-申请全文-v3F-清洁版.docx";
- String path = "F:\\file\\测试\\word对比\\AAA.docx";
- String path1 = "F:\\file\\测试\\word对比\\BBBB.docx";
- if (!path.endsWith(".doc") && !path.endsWith(".docx")) {
- throw new XiaoShiException("请上传Word文件");
- }
- if (!path1.endsWith(".doc") && !path1.endsWith(".docx")) {
- throw new XiaoShiException("请上传Word文件");
- }
- try {
- String text = "";
- String text1 = "";
- if (path.endsWith(".docx")) {
- XWPFDocument document = new XWPFDocument(new FileInputStream(path));
- XWPFWordExtractor extractor = new XWPFWordExtractor(document);
- text = extractor.getText();
- // System.out.println(text);
- document.close();
- } else if (path.endsWith(".doc")) {
- InputStream inputStream = new FileInputStream(path);
- WordExtractor wordExtractor = new WordExtractor(inputStream);
- text = wordExtractor.getText();
- }
- if (path1.endsWith(".docx")) {
- XWPFDocument document1 = new XWPFDocument(new FileInputStream(path1));
- // List<XWPFParagraph> xwpfParagraphs= document1.getParagraphs();
- // xwpfParagraphs.forEach(item->{
- // System.out.println(item.getText());
- // });
- XWPFWordExtractor extractor1 = new XWPFWordExtractor(document1);
- text1 = extractor1.getText();
- // System.out.println(text1);
- // 关闭流
- document1.close();
- } else if (path1.endsWith(".doc")) {
- InputStream inputStream = new FileInputStream(path1);
- WordExtractor wordExtractor = new WordExtractor(inputStream);
- text1 = wordExtractor.getText();
- }
- compareText(text, text1);
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- public static void compareText(String text, String text1) {
- // commons-text
- StringsComparator comparator = new StringsComparator(text, text1);
- EditScript<Character> script = comparator.getScript();
- SectionDiffCommandVisitor commandVisitor = new SectionDiffCommandVisitor();
- script.visit(commandVisitor);
- commandVisitor.finish();
- // System.out.println(commandVisitor.getLeftTemp());
- // System.out.println(commandVisitor.getRightTemp());
- String firstDoc = commandVisitor.getLeftTemp().toString();
- int firstDocLen = firstDoc.length();
- System.out.println("firstDoc:" + firstDocLen);
- String secondDoc = commandVisitor.getRightTemp().toString();
- String secondDocReplace = secondDoc.replace("\n", "<br>");
- String html = "<p>" + secondDocReplace + "</p>";
- // System.out.println(html);
- // 使用Jsoup解析HTML
- Document doc = Jsoup.parse(html);
- Elements paragraphs = doc.select("p");
- // 创建Word文档
- XWPFDocument wordDocument = new XWPFDocument();
- // 遍历每个段落
- for (Element paragraph : paragraphs) {
- XWPFParagraph wordParagraph = wordDocument.createParagraph();
- XWPFRun run = wordParagraph.createRun();
- int emNum = 0;
- int delNum = 0;
- int total = 0;
- // 遍历段落中的每个节点
- for (Node node : paragraph.childNodes()) {
- if (node instanceof TextNode) {
- try {
- boolean b = run.getText(0).isEmpty();
- // System.out.println(run.getText(0));
- if (!b) {
- if (org.apache.commons.lang3.StringUtils.isNotEmpty(run.getText(0))) {
- run = wordParagraph.createRun();
- }
- }
- } catch (Exception e) {
- }
- // 处理纯文本节点
- run.setText(((TextNode) node).text());
- total += ((TextNode) node).text().length();
- } else if (node instanceof Element) {
- // 处理HTML元素
- Element element = (Element) node;
- if ("em".equalsIgnoreCase(element.tagName())) {
- String s = element.text();
- // 应用斜体样式
- run = wordParagraph.createRun();
- run.setText(s);
- emNum += s.length();
- run.setColor("FF0000");
- run.setBold(true);
- run.setItalic(true);
- } else if ("del".equalsIgnoreCase(element.tagName())) {
- String s = element.text();
- // 应用删除线样式
- run = wordParagraph.createRun();
- run.setText(s);
- delNum += s.length();
- run.setColor("0000FF");
- run.setBold(true);
- run.setStrike(true);
- } else if ("br".equalsIgnoreCase(element.tagName())) {
- run = wordParagraph.createRun();
- run.addBreak();
- } else {
- // 递归处理其他标签(如果有)
- for (Node childNode : element.childNodes()) {
- processNode(wordParagraph, childNode);
- }
- }
- }
- }
- System.out.println("Em:" + emNum);
- System.out.println("Del:" + delNum);
- System.out.println("Total:" + total);
- int sum = emNum + delNum;
- final BigDecimal sumBig = new BigDecimal(sum);
- final BigDecimal totalBig = new BigDecimal(total);
- BigDecimal diff = sumBig.divide(totalBig, 2, RoundingMode.HALF_UP)
- .multiply(new BigDecimal(100));
- System.out.println("Diff:" + diff + "%");
- }
- // 将文档写入文件
- try (FileOutputStream out = new FileOutputStream("F:\\file\\测试\\word对比\\CCC.docx")) {
- wordDocument.write(out);
- } catch (IOException e) {
- e.printStackTrace();
- }
- // 关闭文档
- try {
- wordDocument.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- public void addDiff(String caseNo) throws IOException {
- CaseFile caseFile = caseFileMapper.selectOne(new LambdaQueryWrapper<CaseFile>()
- .eq(CaseFile::getCaseNo, caseNo));
- if (ObjectUtils.isNotEmpty(caseFile)) {
- List<AssoCaseFile> assoCaseFiles = assoCaseFileMapper.selectList(new LambdaQueryWrapper<AssoCaseFile>()
- .eq(AssoCaseFile::getCaseId, caseFile.getId()));
- AssoCaseFile caseFile1 = assoCaseFiles.stream().filter(i -> i.getFileType() == 1).findFirst().orElse(new AssoCaseFile());
- AssoCaseFile caseFile2 = assoCaseFiles.stream().filter(i -> i.getFileType() == 3).findFirst().orElse(new AssoCaseFile());
- AssoCaseFile caseFile3 = assoCaseFiles.stream().filter(i -> i.getFileType() == 3).findFirst().orElse(new AssoCaseFile());
- if (ObjectUtils.isNotEmpty(caseFile1) && ObjectUtils.isNotEmpty(caseFile2)) {
- Discrepancy discrepancy = new Discrepancy();
- discrepancy.setCaseId(caseFile.getId());
- discrepancy.setCaseFileId1(caseFile1.getId());
- discrepancy.setCaseFileId2(caseFile2.getId());
- discrepancy.setDiscrepancyType(1);
- discrepancy.insert();
- this.getFile(caseFile1.getFileGuid(), caseFile2.getFileGuid(), discrepancy.getId());
- }
- if (ObjectUtils.isNotEmpty(caseFile2) && ObjectUtils.isNotEmpty(caseFile3)) {
- Discrepancy discrepancy = new Discrepancy();
- discrepancy.setCaseId(caseFile.getId());
- discrepancy.setCaseFileId1(caseFile2.getId());
- discrepancy.setCaseFileId2(caseFile3.getId());
- discrepancy.setDiscrepancyType(2);
- discrepancy.insert();
- }
- }
- }
- public void getFile(String guid1, String guid2, Integer discrepancyId) throws IOException {
- byte[] bytes = fileManagerService.downloadSystemFileFromFMS(guid1);
- File tempFile = File.createTempFile("temp1_", ".docx");
- try (
- InputStream inputStream = new ByteArrayInputStream(bytes);
- FileOutputStream outputStream = new FileOutputStream(tempFile)
- ) {
- IOUtils.copy(inputStream, outputStream);
- }
- byte[] bytes1 = fileManagerService.downloadSystemFileFromFMS(guid2);
- File tempFile1 = File.createTempFile("temp2_", ".docx");
- try (
- InputStream inputStream = new ByteArrayInputStream(bytes1);
- FileOutputStream outputStream = new FileOutputStream(tempFile1)
- ) {
- IOUtils.copy(inputStream, outputStream);
- }
- XWPFDocument document = new XWPFDocument(new FileInputStream(tempFile));
- tempFile.delete();
- tempFile1.delete();
- }
- //最终方法
- @Test
- public void test118() {
- // String path = "F:\\file\\测试\\word对比\\240722-S2418519-测试卷-PACN2414802-一种用于降低GIDL的混合型Gate结构及制备-申请全文-v1F.docx";
- String path = "F:\\file\\测试\\word对比\\S2418519-测试卷-PACN2414802-一种用于降低GIDL的混合型Gate结构及制备-权利要求书-v1r01-sq.docx";
- // String path = "F:\\file\\测试\\word对比\\AAA.docx";
- // String path1 = "F:\\file\\测试\\word对比\\240805-S2418519-测试卷-PACN2414802-一种用于降低GIDL的混合型Gate结构及制备-申请全文-v3F-清洁版.docx";
- String path1 = "F:\\file\\测试\\word对比\\240722-S2418519-测试卷-PACN2414802-一种用于降低GIDL的混合型Gate结构及制备-申请全文-v1F.docx";
- try {
- // String text = "";
- // String text1 = "";
- List<String> list = new ArrayList<>();
- List<String> list1 = new ArrayList<>();
- if (path.endsWith(".docx")) {
- XWPFDocument document = new XWPFDocument(new FileInputStream(path));
- List<XWPFParagraph> paragraphs = document.getParagraphs();
- for (XWPFParagraph paragraph : paragraphs) {
- final String s = paragraph.getText().trim();
- if (StringUtils.isNotEmpty(s)) {
- list.add(s);
- }
- }
- // XWPFWordExtractor extractor = new XWPFWordExtractor(document);
- // text = extractor.getText();
- document.close();
- } else if (path.endsWith(".doc")) {
- InputStream inputStream = new FileInputStream(path);
- WordExtractor wordExtractor = new WordExtractor(inputStream);
- String[] paragraphText = wordExtractor.getParagraphText();
- for (String s : paragraphText) {
- String trim = s.trim();
- if (StringUtils.isNotEmpty(trim.trim())) {
- list.add(trim);
- }
- }
- // list = Arrays.asList(paragraphText);
- // text = wordExtractor.getText();
- }
- if (path1.endsWith(".docx")) {
- XWPFDocument document1 = new XWPFDocument(new FileInputStream(path1));
- List<XWPFParagraph> paragraphs = document1.getParagraphs();
- for (XWPFParagraph paragraph : paragraphs) {
- final String s = paragraph.getText().trim();
- if (StringUtils.isNotEmpty(s)) {
- list1.add(s);
- }
- }
- // XWPFWordExtractor extractor1 = new XWPFWordExtractor(document1);
- // text1 = extractor1.getText();
- // 关闭流
- document1.close();
- } else if (path1.endsWith(".doc")) {
- InputStream inputStream = new FileInputStream(path1);
- WordExtractor wordExtractor = new WordExtractor(inputStream);
- String[] paragraphText = wordExtractor.getParagraphText();
- list1 = Arrays.asList(paragraphText);
- // text1 = wordExtractor.getText();
- }
- Map<String, String> firstMap = this.getText(list);
- Map<String, String> secondMap = this.getText(list1);
- final String text = StringUtils.join(list, "\n");
- final String text1 = StringUtils.join(list1, "\n");
- System.out.println("text:" + text);
- System.out.println("text1:" + text1);
- compareText1(text, text1, 1);
- compareText1(firstMap.get("abstract"), secondMap.get("abstract"), 2);
- compareText1(firstMap.get("claims"), secondMap.get("claims"), 3);
- compareText1(firstMap.get("description"), secondMap.get("description"), 4);
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- public Map<String, String> getText(List<String> list) {
- List<String> abstractSection = new ArrayList<>();
- List<String> claimsSection = new ArrayList<>();
- List<String> descriptionSection = new ArrayList<>();
- int abstractIndex = list.indexOf("说 明 书 摘 要");
- int claimsIndex = list.indexOf("权 利 要 求 书");
- int descriptionIndex = list.indexOf("说 明 书");
- if (abstractIndex >= 0) {
- int end = (claimsIndex > 0 && claimsIndex > abstractIndex) ? claimsIndex : list.size();
- abstractSection = list.subList(abstractIndex, end);
- }
- if (claimsIndex >= 0) {
- int end = (descriptionIndex > 0 && descriptionIndex > claimsIndex) ? descriptionIndex : list.size();
- claimsSection = list.subList(claimsIndex, end);
- }
- if (descriptionIndex >= 0) {
- descriptionSection = list.subList(descriptionIndex, list.size());
- }
- String abstractStr = StringUtils.join(abstractSection, "\n");
- String claimsStr = StringUtils.join(claimsSection, "\n");
- String descriptionStr = StringUtils.join(descriptionSection, "\n");
- Map<String, String> map = new HashMap<>();
- map.put("abstract", abstractStr);
- map.put("claims", claimsStr);
- map.put("description", descriptionStr);
- return map;
- }
- public void compareText1(String text, String text1, Integer type) {
- double similarity = cosineSimilarityService.calculateCosineSimilarity(text, text1);
- int total = text.length();
- System.out.println("firstDoc:" + total);
- StringsComparator comparator = new StringsComparator(text, text1);
- EditScript<Character> script = comparator.getScript();
- SectionDiffCommandVisitor commandVisitor = new SectionDiffCommandVisitor();
- script.visit(commandVisitor);
- commandVisitor.finish();
- String secondDoc = commandVisitor.getRightTemp().toString();
- String secondDocReplace = secondDoc.replaceAll("(\\r\\n|\\n)", "<br>");
- String html = "<p>" + secondDocReplace + "</p>";
- // 使用Jsoup解析HTML
- Document doc = Jsoup.parse(html);
- Elements paragraphs = doc.select("p");
- String html3 = doc.html();
- // String secondDocReplace1 = html3.replaceAll("(\\r\\n|\\n)", "<br>");
- System.out.println(html3);
- // 创建Word文档
- XWPFDocument wordDocument = new XWPFDocument();
- // 遍历每个段落
- for (Element paragraph : paragraphs) {
- List<String> emList = new ArrayList<>();
- List<String> delList = new ArrayList<>();
- // 遍历段落中的每个节点
- for (Node node : paragraph.childNodes()) {
- if (node instanceof Element element) {
- // 处理HTML元素
- if ("em".equalsIgnoreCase(element.tagName())) {
- String s = element.text();
- if (StringUtils.isNotEmpty(s)) {
- emList.add(s);
- }
- } else if ("del".equalsIgnoreCase(element.tagName())) {
- String s = element.text();
- if (StringUtils.isNotEmpty(s)) {
- delList.add(s);
- }
- }
- }
- }
- //修改处数
- int emNum = 0;
- //修改字数
- int emSum = 0;
- //删除处数
- int delNum = 0;
- //删除字数
- int delSum = 0;
- if (!CollectionUtils.isEmpty(emList)) {
- emSum = emList.stream().distinct().mapToInt(String::length).sum();
- emNum = (int) emList.stream().distinct().count();
- }
- if (!CollectionUtils.isEmpty(delList)) {
- delSum = delList.stream().distinct().mapToInt(String::length).sum();
- delNum = (int) delList.stream().distinct().count();
- }
- System.out.println("Em:" + emNum);
- System.out.println("Del:" + delNum);
- System.out.println("Total:" + total);
- //修改总处数
- int editSum = emNum + delNum;
- int editWordNum = emSum + delSum;
- BigDecimal sumBig = new BigDecimal(editWordNum);
- BigDecimal diff = new BigDecimal(0);
- if (total != 0) {
- BigDecimal totalBig = new BigDecimal(total);
- diff = sumBig.divide(totalBig, 6, RoundingMode.HALF_UP);
- } else {
- diff = new BigDecimal(1);
- similarity = 0L;
- }
- System.out.println("Diff:" + diff);
- System.out.println("similarity:" + String.format("%.6f", similarity));
- System.out.println("AAAAAAAAAAAAAA");
- // DiscrepancyDetail detail = new DiscrepancyDetail();
- // detail.setDiscrepancyId(0);
- // detail.setTotalWorldCount(total);
- // detail.setEditCount(editSum);
- // detail.setEditWorldCount(editWordNum);
- // detail.setDelWorldCount(delSum);
- // detail.setDiffType(type);
- // detail.setDiffContent(secondDocReplace);
- // detail.setRate(diff.toString());
- // detail.setSimilarity(String.format("%.4f", similarity));
- // detail.insert();
- }
- // 关闭文档
- try {
- wordDocument.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- @Test
- public void test119() {
- // String path = "F:\\file\\测试\\word对比\\AAA-copy.doc";
- String path = "F:\\file\\测试\\word对比\\AAA.docx";
- String path1 = "F:\\file\\测试\\word对比\\BBBB.docx";
- try {
- String text = "";
- String text1 = "";
- if (path.endsWith(".docx")) {
- XWPFDocument document = new XWPFDocument(new FileInputStream(path));
- XWPFWordExtractor extractor = new XWPFWordExtractor(document);
- text = extractor.getText();
- document.close();
- } else if (path.endsWith(".doc")) {
- InputStream inputStream = new FileInputStream(path);
- WordExtractor wordExtractor = new WordExtractor(inputStream);
- final String[] paragraphText = wordExtractor.getParagraphText();
- text = wordExtractor.getText();
- }
- if (path1.endsWith(".docx")) {
- XWPFDocument document1 = new XWPFDocument(new FileInputStream(path1));
- XWPFWordExtractor extractor1 = new XWPFWordExtractor(document1);
- text1 = extractor1.getText();
- // 关闭流
- document1.close();
- } else if (path1.endsWith(".doc")) {
- InputStream inputStream = new FileInputStream(path1);
- WordExtractor wordExtractor = new WordExtractor(inputStream);
- text1 = wordExtractor.getText();
- }
- compareText2(text, text1);
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- public void compareText2(String text, String text1) {
- double su = cosineSimilarityService.calculateCosineSimilarity(text1, text);
- int total = text.length();
- int total1 = text1.length();
- System.out.println("firstDoc:" + total);
- // commons-text
- StringsComparator comparator = new StringsComparator(text, text1);
- EditScript<Character> script = comparator.getScript();
- SectionDiffCommandVisitor commandVisitor = new SectionDiffCommandVisitor();
- script.visit(commandVisitor);
- commandVisitor.finish();
- // System.out.println(commandVisitor.getLeftTemp());
- // System.out.println(commandVisitor.getRightTemp());
- String secondDoc = commandVisitor.getRightTemp().toString();
- String html = "<p>" + secondDoc + "</p>";
- // System.out.println(html);
- // 使用Jsoup解析HTML
- Document doc = Jsoup.parse(html);
- Elements paragraphs = doc.select("p");
- String html3 = doc.html();
- String secondDocReplace = html3.replace("(\r\n|\n)", "<br>");
- System.out.println(secondDocReplace);
- // 创建Word文档
- XWPFDocument wordDocument = new XWPFDocument();
- // 遍历每个段落
- for (Element paragraph : paragraphs) {
- List<String> emList = new ArrayList<>();
- List<String> delList = new ArrayList<>();
- // 遍历段落中的每个节点
- for (Node node : paragraph.childNodes()) {
- if (node instanceof Element element) {
- // 处理HTML元素
- if ("em".equalsIgnoreCase(element.tagName())) {
- String s = element.text();
- emList.add(s);
- } else if ("del".equalsIgnoreCase(element.tagName())) {
- String s = element.text();
- delList.add(s);
- }
- }
- }
- //修改处数
- int emNum = 0;
- //修改字数
- int emSum = 0;
- //删除处数
- int delNum = 0;
- //删除字数
- int delSum = 0;
- if (!CollectionUtils.isEmpty(emList)) {
- emSum = emList.stream().mapToInt(String::length).sum();
- emNum = (int) emList.stream().distinct().count();
- }
- if (!CollectionUtils.isEmpty(delList)) {
- delSum = delList.stream().mapToInt(String::length).sum();
- delNum = (int) delList.stream().distinct().count();
- }
- System.out.println("Em:" + emNum);
- System.out.println("Del:" + delNum);
- System.out.println("Total:" + total);
- //修改总处数
- int editSum = emNum + delNum;
- int editWordNum = emSum + delSum;
- BigDecimal sumBig = new BigDecimal(editWordNum);
- BigDecimal totalBig = new BigDecimal(total);
- BigDecimal diff = sumBig.divide(totalBig, 4, RoundingMode.HALF_UP);
- System.out.println("Diff:" + diff);
- System.out.println("similar:" + String.format("%.4f", su));
- }
- // 关闭文档
- try {
- wordDocument.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- @Test
- public void test120() throws IOException {
- // String path = "F:\\file\\测试\\word对比\\AAA-copy.doc";
- // String path = "F:\\file\\测试\\word对比\\AAA.docx";
- // String path1 = "F:\\file\\测试\\word对比\\BBBB.docx";
- String path = "F:\\file\\测试\\word对比\\S2418519-测试卷-PACN2414802-一种用于降低GIDL的混合型Gate结构及制备-权利要求书-v1r01-sq.docx";
- String path1 = "F:\\file\\测试\\word对比\\240722-S2418519-测试卷-PACN2414802-一种用于降低GIDL的混合型Gate结构及制备-申请全文-v1F.docx";
- List<String> list = new ArrayList<>();
- List<String> list1 = new ArrayList<>();
- try {
- String text = "";
- String text1 = "";
- if (path.endsWith(".docx")) {
- XWPFDocument document = new XWPFDocument(new FileInputStream(path));
- List<XWPFParagraph> paragraphs = document.getParagraphs();
- for (XWPFParagraph paragraph : paragraphs) {
- final String s = paragraph.getText().trim();
- if (StringUtils.isNotEmpty(s)) {
- list.add(s);
- }
- }
- XWPFWordExtractor extractor = new XWPFWordExtractor(document);
- text = extractor.getText();
- document.close();
- } else if (path.endsWith(".doc")) {
- InputStream inputStream = new FileInputStream(path);
- WordExtractor wordExtractor = new WordExtractor(inputStream);
- String[] paragraphText = wordExtractor.getParagraphText();
- for (String s : paragraphText) {
- if (StringUtils.isNotEmpty(s)) {
- list.add(s);
- }
- }
- list = Arrays.asList(paragraphText);
- text = wordExtractor.getText();
- }
- if (path1.endsWith(".docx")) {
- XWPFDocument document1 = new XWPFDocument(new FileInputStream(path1));
- List<XWPFParagraph> paragraphs = document1.getParagraphs();
- for (XWPFParagraph paragraph : paragraphs) {
- String s = paragraph.getText().trim();
- if (StringUtils.isNotEmpty(s)) {
- list1.add(s);
- }
- }
- XWPFWordExtractor extractor1 = new XWPFWordExtractor(document1);
- text1 = extractor1.getText();
- // 关闭流
- document1.close();
- } else if (path1.endsWith(".doc")) {
- InputStream inputStream = new FileInputStream(path1);
- WordExtractor wordExtractor = new WordExtractor(inputStream);
- text1 = wordExtractor.getText();
- }
- // final double cscwv = cosineSimilarityService.calculateAverageCSCWV(list, list1);
- // final double su = cosineSimilarityService.calculateCosineSimilarity(text1, text);
- // final double su1 = cosineSimilarityService.calculateCosineSimilarity(text, text);
- // System.out.println(su);
- Map<String, String> firstMap = this.getText(list);
- Map<String, String> secondMap = this.getText(list1);
- System.out.println(firstMap);
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- }
|