using System;
using System.Collections.Generic;
using System.IO;
using System.IO.Packaging;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using DiffPlex;
using Microsoft.Office.Interop.Word;
using wispro.sp.entity.CompareCase;

namespace wispro.sp.utility
{
    /// <summary>
    /// Compares the text of two Word documents (.doc / .docx) and reports the differences,
    /// both for the whole document and for the claims, abstract and description sections.
    /// </summary>
    public class CompareDocx
    {
        /// <summary>
        /// Text extracted from one patent document, split into its main sections.
        /// </summary>
        public class PatentDocument
        {
            // Section headings recognised while scanning the document. A line that equals one of
            // these (after removing U+0001, spaces and surrounding whitespace) starts a new section.
            private static readonly HashSet<string> SectionHeadings = new HashSet<string>
            {
                "权利要求书", "说明书摘要", "说明书", "背景技术", "发明内容",
                "技术领域", "具体实施方式", "摘要附图", "说明书附图"
            };

            // NOTE(review): the markup inside these two patterns was missing from the reviewed text
            // (it read "(|)(.*?)", which only ever matches empty strings). The patterns below are
            // reconstructed from the method names and the use of capture group 2 - confirm.
            private static readonly Regex ParagraphRegex =
                new Regex("(<w:p>|<w:p .*?>)(.*?)</w:p>", RegexOptions.Compiled);

            private static readonly Regex TextRunRegex =
                new Regex("(<w:t>|<w:t .*?>)(.*?)</w:t>", RegexOptions.Compiled);

            /// <summary>Path of the source file.</summary>
            public string FilePath { get; set; }

            /// <summary>Text collected under the "说明书摘要" heading.</summary>
            public string Abstract { get; set; }

            /// <summary>Text collected under the "权利要求书" heading.</summary>
            public string Claim { get; set; }

            /// <summary>
            /// Text collected under the "说明书", "背景技术", "发明内容", "技术领域" and
            /// "具体实施方式" headings.
            /// </summary>
            public string FullText { get; set; }

            /// <summary>
            /// All non-empty lines of the document, trimmed and joined with CRLF.
            /// Stays null for unsupported extensions and for .doc files whose name contains "-保密-".
            /// </summary>
            public string DocumentString { get; set; }

            /// <summary>
            /// Loads the document at <paramref name="filePath"/> and extracts its text.
            /// </summary>
            /// <param name="filePath">Path to a .doc or .docx file.</param>
            /// <exception cref="ApplicationException">The file does not exist.</exception>
            public PatentDocument(string filePath)
            {
                this.FilePath = filePath;

                if (!System.IO.File.Exists(this.FilePath))
                {
                    throw new ApplicationException("指定的文件不存在!");
                }

                // Both extension checks ignore case so that "X.DOC" is handled like "x.doc".
                if (this.FilePath.EndsWith(".doc", StringComparison.OrdinalIgnoreCase))
                {
                    DocumentString = GetDocTxt(this.FilePath);
                }
                else if (this.FilePath.EndsWith(".docx", StringComparison.OrdinalIgnoreCase))
                {
                    DocumentString = GetDocxTxt(this.FilePath);
                }
            }

            /// <summary>
            /// Reads a legacy .doc file through Word automation.
            /// </summary>
            /// <param name="filePath">Path to the .doc file.</param>
            /// <returns>
            /// The normalised document text, or null when the file name contains "-保密-".
            /// </returns>
            /// <exception cref="Exception">
            /// Reading failed; the original error is available as the inner exception.
            /// </exception>
            private string GetDocTxt(string filePath)
            {
                Application word = null;
                Document doc = null;

                try
                {
                    System.IO.FileInfo fileInfo = new System.IO.FileInfo(filePath);

                    // Files marked "-保密-" are skipped; check before starting Word at all.
                    if (fileInfo.Name.Contains("-保密-"))
                    {
                        return null;
                    }

                    word = new Application();
                    doc = word.Documents.Open(fileInfo.FullName);

                    string content = doc.Content.Text;
                    List<string> lines = content
                        .Split(new string[] { "\f", "\r" }, StringSplitOptions.None)
                        .ToList();
                    return List2String(lines);
                }
                catch (Exception ex)
                {
                    // Keep the original exception as InnerException so the cause is not lost.
                    throw new Exception($"读取Word文档时发生错误: {ex.Message}", ex);
                }
                finally
                {
                    // Nested try/finally: Word must still be shut down if closing the document throws.
                    try
                    {
                        if (doc != null)
                        {
                            doc.Close();
#pragma warning disable CA1416 // Validate platform compatibility
                            System.Runtime.InteropServices.Marshal.ReleaseComObject(doc);
#pragma warning restore CA1416 // Validate platform compatibility
                        }
                    }
                    finally
                    {
                        if (word != null)
                        {
                            word.Quit();
#pragma warning disable CA1416 // Validate platform compatibility
                            System.Runtime.InteropServices.Marshal.ReleaseComObject(word);
#pragma warning restore CA1416 // Validate platform compatibility
                        }
                    }
                }
            }

            /// <summary>
            /// Reads a .docx file by extracting paragraph text from its main document part.
            /// </summary>
            /// <param name="filepath">Path to the .docx file.</param>
            /// <returns>The normalised document text.</returns>
            private string GetDocxTxt(string filepath)
            {
                string mainXml = getDocxMainXml(filepath);
                List<string> paragraphs = ExtractWPTextFromXml(mainXml);
                return List2String(paragraphs);
            }

            /// <summary>
            /// Joins the non-empty lines into one CRLF-separated string and, as a side effect,
            /// fills <see cref="Claim"/>, <see cref="Abstract"/> and <see cref="FullText"/>
            /// from the lines that follow the corresponding section headings.
            /// </summary>
            /// <param name="lines">Document lines in reading order.</param>
            /// <returns>All non-empty lines, each trimmed and terminated with CRLF.</returns>
            private string List2String(List<string> lines)
            {
                StringBuilder all = new StringBuilder();
                StringBuilder claim = new StringBuilder(this.Claim ?? string.Empty);
                StringBuilder summary = new StringBuilder(this.Abstract ?? string.Empty);
                StringBuilder description = new StringBuilder(this.FullText ?? string.Empty);
                string currentSection = string.Empty;

                foreach (string line in lines)
                {
                    if (string.IsNullOrEmpty(line))
                    {
                        continue;
                    }

                    all.Append(line.Trim()).Append("\r\n");

                    string normalized = line.Replace("\u0001", "").Replace(" ", "").Trim();
                    if (SectionHeadings.Contains(normalized))
                    {
                        currentSection = normalized;
                        continue;
                    }

                    // Lines under "摘要附图" / "说明书附图" (or before any heading) belong to no section.
                    switch (currentSection)
                    {
                        case "权利要求书":
                            claim.Append("\r\n").Append(line);
                            break;
                        case "说明书摘要":
                            summary.Append("\r\n").Append(line);
                            break;
                        case "说明书":
                        case "背景技术":
                        case "发明内容":
                        case "技术领域":
                        case "具体实施方式":
                            description.Append("\r\n").Append(line);
                            break;
                    }
                }

                this.Abstract = summary.ToString().Trim();
                this.Claim = claim.ToString().Trim();
                this.FullText = description.ToString().Trim();

                return all.ToString();
            }

            /// <summary>
            /// Returns the raw XML of the package part whose content type starts with the
            /// WordprocessingML main-document content type.
            /// </summary>
            /// <param name="filePath">Path to the .docx package.</param>
            /// <returns>The part's XML text, or an empty string when no such part exists.</returns>
            private string getDocxMainXml(string filePath)
            {
                // Open read-only: the two-argument overload requests read/write access and
                // therefore fails on read-only files even though nothing is written here.
                using (Package package = Package.Open(filePath, FileMode.Open, System.IO.FileAccess.Read))
                {
                    foreach (var part in package.GetParts())
                    {
                        if (!part.ContentType.StartsWith(
                                "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main",
                                StringComparison.Ordinal))
                        {
                            continue;
                        }

                        using (Stream stream = part.GetStream())
                        using (StreamReader reader = new StreamReader(stream))
                        {
                            return reader.ReadToEnd();
                        }
                    }
                }

                return string.Empty;
            }

            /// <summary>
            /// Extracts the text of every paragraph element found in the XML.
            /// </summary>
            /// <param name="xmlText">Raw XML of the main document part.</param>
            /// <returns>One entry per matched paragraph, in document order.</returns>
            private List<string> ExtractWPTextFromXml(string xmlText)
            {
                List<string> lines = new List<string>();

                foreach (Match match in ParagraphRegex.Matches(xmlText))
                {
                    lines.Add(ExtractWtTextFromXml(match.Groups[2].Value));
                }

                return lines;
            }

            /// <summary>
            /// Concatenates the content of every text-run element found in the XML fragment.
            /// </summary>
            /// <param name="xmlText">Inner XML of one paragraph.</param>
            /// <returns>The concatenated run text (XML entities are left as they appear).</returns>
            private string ExtractWtTextFromXml(string xmlText)
            {
                StringBuilder sb = new StringBuilder();

                foreach (Match match in TextRunRegex.Matches(xmlText))
                {
                    sb.Append(match.Groups[2].Value);
                }

                return sb.ToString();
            }
        }

        /// <summary>
        /// The original document.
        /// </summary>
        public PatentDocument oldDocument { get; set; }

        /// <summary>
        /// The revised document.
        /// </summary>
        public PatentDocument newDocument { get; set; }

        /// <summary>
        /// Comparison result for the claims section.
        /// </summary>
        public CompareResult ClaimResult { get; set; }

        /// <summary>
        /// Comparison result for the abstract section.
        /// </summary>
        public CompareResult AbstractResult { get; set; }

        /// <summary>
        /// Comparison result for the description section.
        /// </summary>
        public CompareResult FulltextResult { get; set; }

        /// <summary>
        /// Comparison result for the complete document text.
        /// </summary>
        public CompareResult AllStringResult { get; set; }

        /// <summary>
        /// Loads both documents and compares them section by section.
        /// A result property is only assigned when the corresponding text is non-empty in both documents.
        /// </summary>
        /// <param name="oldFile">Path of the original document.</param>
        /// <param name="newFile">Path of the revised document.</param>
        public void Compare(string oldFile, string newFile)
        {
            this.oldDocument = new PatentDocument(oldFile);
            this.newDocument = new PatentDocument(newFile);

            if (!string.IsNullOrEmpty(this.oldDocument.Claim) && !string.IsNullOrEmpty(this.newDocument.Claim))
            {
                this.ClaimResult = StringCompare(this.oldDocument.Claim, this.newDocument.Claim);
            }

            if (!string.IsNullOrEmpty(this.oldDocument.Abstract) && !string.IsNullOrEmpty(this.newDocument.Abstract))
            {
                this.AbstractResult = StringCompare(this.oldDocument.Abstract, this.newDocument.Abstract);
            }

            if (!string.IsNullOrEmpty(this.oldDocument.FullText) && !string.IsNullOrEmpty(this.newDocument.FullText))
            {
                this.FulltextResult = StringCompare(this.oldDocument.FullText, this.newDocument.FullText);
            }

            if (!string.IsNullOrEmpty(this.oldDocument.DocumentString) && !string.IsNullOrEmpty(this.newDocument.DocumentString))
            {
                this.AllStringResult = StringCompare(this.oldDocument.DocumentString, this.newDocument.DocumentString);
            }
        }

        /// <summary>
        /// Performs a character-level diff of two strings.
        /// </summary>
        /// <param name="oldtext">Original text; null is treated as empty.</param>
        /// <param name="newtext">Revised text; null is treated as empty.</param>
        /// <returns>
        /// Lengths of both texts, edit/insert/delete counts (an identical modification that occurs
        /// several times is counted once), the marked-up comparison string and the text similarity.
        /// </returns>
        public CompareResult StringCompare(string oldtext, string newtext)
        {
            oldtext = oldtext ?? "";
            newtext = newtext ?? "";

            CompareResult result = new CompareResult();
            result.oldWordCount = oldtext.Length;
            result.newWordCount = newtext.Length;

            var differ = new Differ();
            var diff = differ.CreateCharacterDiffs(oldtext, newtext, true);

            int lastPos = 0;
            StringBuilder markup = new StringBuilder();
            HashSet<string> seenModifications = new HashSet<string>();

            foreach (var change in diff.DiffBlocks)
            {
                // Unchanged text between the previous block and this one.
                markup.Append(oldtext, lastPos, change.DeleteStartA - lastPos);
                lastPos = change.DeleteStartA + change.DeleteCountA;

                // NOTE(review): the wrapper markup was missing from the reviewed text, which emitted
                // deleted and inserted characters back to back with nothing to tell them apart.
                // <del>/<ins> are assumed here - confirm against whatever renders CompareResultString.
                StringBuilder modification = new StringBuilder();
                if (change.DeleteCountA > 0)
                {
                    modification.Append("<del>")
                        .Append(oldtext, change.DeleteStartA, change.DeleteCountA)
                        .Append("</del>");
                }

                if (change.InsertCountB > 0)
                {
                    modification.Append("<ins>")
                        .Append(newtext, change.InsertStartB, change.InsertCountB)
                        .Append("</ins>");
                }

                string modificationText = modification.ToString();
                markup.Append(modificationText);

                // Add returns false for a modification already seen, so repeats are not counted again.
                if (seenModifications.Add(modificationText))
                {
                    result.DeleteCount += change.DeleteCountA;
                    result.InsertCount += change.InsertCountB;
                    result.EditCount += 1;
                }
            }

            // Unchanged tail after the last diff block.
            markup.Append(oldtext, lastPos, oldtext.Length - lastPos);

            // NOTE(review): the replacement literal was a raw line break in the reviewed text (not
            // valid inside a C# string literal); an HTML line break is assumed - confirm.
            result.CompareResultString = markup.ToString().Replace("\r\n", "<br/>");
            result.TextSimilarity = CosineSimilarity.Calculate(oldtext, newtext);

            return result;
        }
    }
}