123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322 |
- using System;
- using System.Collections.Generic;
- using System.IO;
- using System.Text;
- using DiffPlex;
- using System.IO.Packaging;
- using System.Text.RegularExpressions;
- using System.Linq;
- using Microsoft.Office.Interop.Word;
- using wispro.sp.entity.CompareCase;
- namespace wispro.sp.utility
- {
- /// <summary>
- /// 比较两个Docx文档中文字的不同
- /// </summary>
- public class CompareDocx
- {
- public class PatentDocument
- {
/// <summary>Path of the source file exactly as it was passed to the constructor.</summary>
public string FilePath { get; set; }

/// <summary>Text gathered from the lines that follow the "说明书摘要" heading; empty when none was found.</summary>
public string Abstract { get; set; }

/// <summary>Text gathered from the lines that follow the "权利要求书" heading; empty when none was found.</summary>
public string Claim { get; set; }

/// <summary>
/// Text gathered from the lines that follow the "说明书", "背景技术", "发明内容", "技术领域"
/// and "具体实施方式" headings; empty when none was found.
/// </summary>
public string FullText { get; set; }

/// <summary>
/// Every non-empty line of the document, trimmed and terminated with CRLF.
/// Stays <c>null</c> when the file extension is neither .doc nor .docx.
/// </summary>
public string DocumentString { get; set; }

/// <summary>
/// Loads a Word document and splits its text into the abstract, claims and description sections.
/// </summary>
/// <param name="filePath">Path of a .doc or .docx file.</param>
/// <exception cref="ApplicationException">The file does not exist.</exception>
public PatentDocument(string filePath)
{
    this.FilePath = filePath;
    if (!System.IO.File.Exists(this.FilePath))
    {
        throw new ApplicationException("指定的文件不存在!");
    }

    // Compare the extension ordinally and case-insensitively so that "X.DOC" and "X.Docx"
    // are handled the same way as their lower-case spellings.
    string extension = System.IO.Path.GetExtension(this.FilePath);
    if (string.Equals(extension, ".doc", StringComparison.OrdinalIgnoreCase))
    {
        DocumentString = GetDocTxt(this.FilePath);
    }
    else if (string.Equals(extension, ".docx", StringComparison.OrdinalIgnoreCase))
    {
        DocumentString = GetDocxTxt(this.FilePath);
    }
    // NOTE(review): any other extension is left unparsed (all text properties stay null),
    // which matches the previous behavior — confirm whether callers would prefer an exception.
}
/// <summary>
/// Reads a legacy .doc file through Word automation and returns its text,
/// populating the section properties as a side effect of <c>List2String</c>.
/// </summary>
/// <param name="filePath">Path of the .doc file.</param>
/// <returns>The document text as produced by <c>List2String</c>.</returns>
/// <exception cref="Exception">Word could not open or read the document; the original error is kept as the inner exception.</exception>
private string GetDocTxt(string filePath)
{
    Application word = null;
    Document doc = null;
    try
    {
        // Start a Word automation instance.
        word = new Application();

        // Resolve to an absolute path before handing it to Word, and open read-only
        // because the document is only read here.
        string fullPath = new System.IO.FileInfo(filePath).FullName;
        doc = word.Documents.Open(fullPath, ReadOnly: true);

        // Word separates paragraphs with "\r" and pages with "\f"; split on both.
        string content = doc.Content.Text;
        List<string> lines = content.Split(new string[] { "\f", "\r" }, StringSplitOptions.None).ToList();
        return List2String(lines);
    }
    catch (Exception ex)
    {
        // Keep the original exception so the stack trace and COM error details are not lost.
        throw new Exception($"读取Word文档时发生错误: {ex.Message}", ex);
    }
    finally
    {
        try
        {
            // Close the document without saving; nothing was modified on purpose.
            if (doc != null)
            {
                doc.Close(false);
#pragma warning disable CA1416 // Validate platform compatibility
                System.Runtime.InteropServices.Marshal.ReleaseComObject(doc);
#pragma warning restore CA1416 // Validate platform compatibility
            }
        }
        finally
        {
            // Always quit Word, even if closing the document failed, so no process is left behind.
            if (word != null)
            {
                word.Quit(false);
#pragma warning disable CA1416 // Validate platform compatibility
                System.Runtime.InteropServices.Marshal.ReleaseComObject(word);
#pragma warning restore CA1416 // Validate platform compatibility
            }
        }
    }
}
/// <summary>
/// Reads a .docx file: loads the main document XML, extracts one text line per paragraph
/// and hands the lines to <c>List2String</c>.
/// </summary>
/// <param name="filepath">Path of the .docx file.</param>
/// <returns>The document text as produced by <c>List2String</c>.</returns>
private string GetDocxTxt(string filepath)
{
    string mainXml = getDocxMainXml(filepath);
    List<string> paragraphs = ExtractWPTextFromXml(mainXml);
    return List2String(paragraphs);
}
/// <summary>
/// Joins the non-empty lines into one CRLF-terminated string and, while doing so, sorts each
/// line into <c>Claim</c>, <c>Abstract</c> or <c>FullText</c> according to the most recent
/// section heading seen.
/// </summary>
/// <param name="lines">Paragraph texts in document order.</param>
/// <returns>All non-empty lines, each trimmed and followed by CRLF.</returns>
private string List2String(List<string> lines)
{
    // Lines that consist solely of one of these titles start a new section.
    var headings = new HashSet<string>
    {
        "权利要求书", "说明书摘要", "说明书", "背景技术", "发明内容",
        "技术领域", "具体实施方式", "摘要附图", "说明书附图"
    };

    var allText = new StringBuilder();
    // Seed the builders with whatever the properties already hold so repeated calls keep appending.
    var claimText = new StringBuilder(this.Claim);
    var abstractText = new StringBuilder(this.Abstract);
    var descriptionText = new StringBuilder(this.FullText);
    string currentSection = string.Empty;

    foreach (var line in lines)
    {
        if (string.IsNullOrEmpty(line))
        {
            continue;
        }

        allText.Append(line.Trim()).Append("\r\n");

        // Headings may carry U+0001 control characters or be letter-spaced with blanks; strip both before matching.
        string normalized = line.Replace("\u0001", "").Replace(" ", "").Trim();
        if (headings.Contains(normalized))
        {
            currentSection = normalized;
            continue;
        }

        switch (currentSection)
        {
            case "权利要求书":
                claimText.Append("\r\n").Append(line);
                break;
            case "说明书摘要":
                abstractText.Append("\r\n").Append(line);
                break;
            case "说明书":
            case "背景技术":
            case "发明内容":
            case "技术领域":
            case "具体实施方式":
                descriptionText.Append("\r\n").Append(line);
                break;
            // Lines under "摘要附图" / "说明书附图" (or before any heading) belong to no section.
        }
    }

    this.Abstract = abstractText.ToString().Trim();
    this.Claim = claimText.ToString().Trim();
    this.FullText = descriptionText.ToString().Trim();
    return allText.ToString();
}
/// <summary>
/// Opens the .docx package and returns the raw XML of its main document part.
/// </summary>
/// <param name="filePath">Path of the .docx file.</param>
/// <returns>The XML text of the first main-document part, or an empty string when the package has none.</returns>
private string getDocxMainXml(string filePath)
{
    const string MainDocumentContentType =
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main";

    // Open read-only and shareable: the two-argument overload requests read/write access,
    // which fails on read-only files and blocks other readers.
    using (Package package = Package.Open(filePath, FileMode.Open, FileAccess.Read, FileShare.Read))
    {
        foreach (PackagePart part in package.GetParts())
        {
            // Prefix match so the "+xml" suffix of the content type is accepted.
            if (!part.ContentType.StartsWith(MainDocumentContentType, StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            using (Stream stream = part.GetStream(FileMode.Open, FileAccess.Read))
            using (StreamReader reader = new StreamReader(stream))
            {
                return reader.ReadToEnd();
            }
        }
    }

    return string.Empty;
}
/// <summary>
/// Splits WordprocessingML markup into paragraphs (<c>w:p</c> elements) and returns the text of each one.
/// </summary>
/// <param name="xmlText">Raw XML of the main document part.</param>
/// <returns>One entry per matched paragraph, in document order; entries may be empty.</returns>
private List<string> ExtractWPTextFromXml(string xmlText)
{
    List<string> lines = new List<string>();

    // Match every <w:p ...>...</w:p> element. Singleline lets '.' cross line breaks, so
    // paragraphs in pretty-printed XML are not silently skipped.
    MatchCollection matches = Regex.Matches(
        xmlText,
        "(<w:p\\s.*?>|<w:p>)(.*?)</w:p>",
        RegexOptions.Singleline);

    foreach (Match match in matches)
    {
        // Group 2 is the paragraph's inner markup.
        lines.Add(ExtractWtTextFromXml(match.Groups[2].Value));
    }

    return lines;
}
/// <summary>
/// Concatenates the content of all text runs (<c>w:t</c> elements) found in a fragment of WordprocessingML.
/// </summary>
/// <param name="xmlText">Inner markup of a paragraph.</param>
/// <returns>The run texts joined without separators; empty when there is no <c>w:t</c> element.</returns>
private string ExtractWtTextFromXml(string xmlText)
{
    // Match every <w:t ...>...</w:t> element. Singleline lets '.' cross line breaks, so a run
    // whose text contains a newline is no longer dropped.
    MatchCollection matches = Regex.Matches(
        xmlText,
        "(<w:t\\s.*?>|<w:t>)(.*?)</w:t>",
        RegexOptions.Singleline);

    StringBuilder sb = new StringBuilder();
    foreach (Match match in matches)
    {
        // NOTE(review): the captured text is returned as-is, so XML entities such as "&amp;" stay
        // escaped. Decoding them here would need matching HTML-encoding wherever the text is rendered.
        sb.Append(match.Groups[2].Value);
    }

    return sb.ToString();
}
- }
/// <summary>
/// The parsed original document; assigned by <see cref="Compare"/>.
/// </summary>
public PatentDocument oldDocument { get; set; }

/// <summary>
/// The parsed revised document; assigned by <see cref="Compare"/>.
/// </summary>
public PatentDocument newDocument { get; set; }

/// <summary>
/// Comparison result for the claims sections.
/// </summary>
public CompareResult ClaimResult { get; set; }

/// <summary>
/// Comparison result for the abstract sections.
/// </summary>
public CompareResult AbstractResult { get; set; }

/// <summary>
/// Comparison result for the description sections.
/// </summary>
public CompareResult FulltextResult { get; set; }

/// <summary>
/// Comparison result for the complete document text.
/// </summary>
public CompareResult AllStringResult { get; set; }

/// <summary>
/// Loads both documents and fills the four result properties with the section-by-section
/// and whole-text comparisons.
/// </summary>
/// <param name="oldFile">Path of the original document.</param>
/// <param name="newFile">Path of the revised document.</param>
/// <exception cref="ApplicationException">One of the files does not exist.</exception>
public void Compare(string oldFile, string newFile)
{
    // Load and publish the original before touching the revised file, so oldDocument
    // is already set if loading the second file fails.
    PatentDocument before = new PatentDocument(oldFile);
    oldDocument = before;

    PatentDocument after = new PatentDocument(newFile);
    newDocument = after;

    ClaimResult = StringCompare(before.Claim, after.Claim);
    AbstractResult = StringCompare(before.Abstract, after.Abstract);
    FulltextResult = StringCompare(before.FullText, after.FullText);
    AllStringResult = StringCompare(before.DocumentString, after.DocumentString);
}
/// <summary>
/// Compares two strings character by character and produces an HTML rendering of the
/// differences together with edit statistics and a similarity score.
/// </summary>
/// <param name="oldtext">Original text; <c>null</c> is treated as empty.</param>
/// <param name="newtext">Revised text; <c>null</c> is treated as empty.</param>
/// <returns>
/// A <see cref="CompareResult"/> holding the character counts of both texts, the number of
/// distinct edits with their deleted/inserted character totals, the marked-up diff
/// (deletions struck through in red, insertions underlined in blue, CRLF turned into
/// <c>&lt;br/&gt;</c>) and the similarity reported by <c>CosineSimilarity.Calculate</c>.
/// </returns>
public CompareResult StringCompare(string oldtext, string newtext)
{
    const string DeletedOpen = "<strike style=\"text-decoration: line-through; color: red;\">";
    const string DeletedClose = "</strike>";
    const string InsertedOpen = "<u style=\"text-decoration: underline; color: blue;\">";
    const string InsertedClose = "</u>";

    oldtext = oldtext ?? string.Empty;
    newtext = newtext ?? string.Empty;

    CompareResult result = new CompareResult();
    result.oldWordCount = oldtext.Length;
    result.newWordCount = newtext.Length;

    // Third argument: ignore whitespace differences.
    var diff = new Differ().CreateCharacterDiffs(oldtext, newtext, true);

    // NOTE(review): the texts are embedded in the markup without HTML-encoding. Text read from
    // .docx files still carries its XML escaping, so encoding here would double-escape it;
    // revisit both places together.
    var markup = new StringBuilder(oldtext.Length + newtext.Length);

    // Identical edits (same deleted and inserted text) are counted only once.
    var countedEdits = new HashSet<string>();
    int lastPos = 0;

    foreach (var change in diff.DiffBlocks)
    {
        // Unchanged text between the previous block and this one.
        markup.Append(oldtext, lastPos, change.DeleteStartA - lastPos);
        lastPos = change.DeleteStartA + change.DeleteCountA;

        var edit = new StringBuilder();
        if (change.DeleteCountA > 0)
        {
            edit.Append(DeletedOpen)
                .Append(oldtext, change.DeleteStartA, change.DeleteCountA)
                .Append(DeletedClose);
        }
        if (change.InsertCountB > 0)
        {
            edit.Append(InsertedOpen)
                .Append(newtext, change.InsertStartB, change.InsertCountB)
                .Append(InsertedClose);
        }

        string editMarkup = edit.ToString();
        markup.Append(editMarkup);

        if (countedEdits.Add(editMarkup))
        {
            result.DeleteCount += change.DeleteCountA;
            result.InsertCount += change.InsertCountB;
            result.EditCount += 1;
        }
    }

    // Unchanged tail after the last block.
    markup.Append(oldtext, lastPos, oldtext.Length - lastPos);

    result.CompareResultString = markup.ToString().Replace("\r\n", "<br/>");
    result.TextSimilarity = CosineSimilarity.Calculate(oldtext, newtext);
    return result;
}
-
- }
- }
|