CompareDocx.cs 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322
  1. using System;
  2. using System.Collections.Generic;
  3. using System.IO;
  4. using System.Text;
  5. using DiffPlex;
  6. using System.IO.Packaging;
  7. using System.Text.RegularExpressions;
  8. using System.Linq;
  9. using Microsoft.Office.Interop.Word;
  10. using wispro.sp.entity.CompareCase;
  11. namespace wispro.sp.utility
  12. {
  13. /// <summary>
  14. /// 比较两个Docx文档中文字的不同
  15. /// </summary>
  16. public class CompareDocx
  17. {
  18. public class PatentDocument
  19. {
  /// <summary>Path of the source file, exactly as passed to the constructor.</summary>
  public string FilePath { get; set; }

  /// <summary>Lines collected under the "说明书摘要" heading while the document text is parsed.</summary>
  public string Abstract { get; set; }

  /// <summary>Lines collected under the "权利要求书" heading while the document text is parsed.</summary>
  public string Claim { get; set; }

  /// <summary>
  /// Lines collected under the "说明书", "背景技术", "发明内容", "技术领域" and
  /// "具体实施方式" headings while the document text is parsed.
  /// </summary>
  public string FullText { get; set; }

  /// <summary>
  /// Text of the whole document, one trimmed non-empty line per CRLF-terminated row;
  /// assigned by the constructor for .doc and .docx files.
  /// </summary>
  public string DocumentString { get; set; }
  /// <summary>
  /// Loads the Word document at <paramref name="filePath"/> and extracts its text
  /// into <see cref="DocumentString"/>.
  /// </summary>
  /// <remarks>
  /// Files ending in ".doc" are read with <c>GetDocTxt</c>, files ending in ".docx"
  /// with <c>GetDocxTxt</c>. Any other extension leaves <see cref="DocumentString"/>
  /// unassigned (null); no exception is raised for it.
  /// </remarks>
  /// <param name="filePath">Path of the document to load.</param>
  /// <exception cref="ApplicationException">The file does not exist.</exception>
  public PatentDocument(string filePath)
  {
      this.FilePath = filePath;

      if (!System.IO.File.Exists(this.FilePath))
      {
          throw new ApplicationException("指定的文件不存在!");
      }

      // Extensions are compared ordinally and case-insensitively so that "X.DOC"
      // and "X.Docx" are handled the same way as their lower-case forms, and the
      // result does not depend on the current culture.
      if (this.FilePath.EndsWith(".doc", StringComparison.OrdinalIgnoreCase))
      {
          DocumentString = GetDocTxt(this.FilePath);
      }
      else if (this.FilePath.EndsWith(".docx", StringComparison.OrdinalIgnoreCase))
      {
          DocumentString = GetDocxTxt(this.FilePath);
      }
  }
  /// <summary>
  /// Reads a legacy .doc file through Word automation and returns its text as
  /// produced by <c>List2String</c>.
  /// </summary>
  /// <param name="filePath">Path of the .doc file; resolved to a full path before opening.</param>
  /// <returns>The value returned by <c>List2String</c> for the document's lines.</returns>
  /// <exception cref="Exception">
  /// Any failure while starting Word or reading the document; the original
  /// exception is available as <see cref="Exception.InnerException"/>.
  /// </exception>
  private string GetDocTxt(string filePath)
  {
      Application word = null;
      Document doc = null;
      try
      {
          // Start a Word application instance.
          word = new Application();

          // Open read-only: the file is only read, so there is no reason to take a write lock.
          System.IO.FileInfo fileInfo = new System.IO.FileInfo(filePath);
          doc = word.Documents.Open(fileInfo.FullName, ReadOnly: true);

          // The document text is split on form feeds and carriage returns.
          string content = doc.Content.Text;
          List<string> lines = content.Split(new string[] { "\f", "\r" }, StringSplitOptions.None).ToList();
          return List2String(lines);
      }
      catch (Exception ex)
      {
          // Keep the original exception as InnerException so the stack trace is not lost.
          throw new Exception($"读取Word文档时发生错误: {ex.Message}", ex);
      }
      finally
      {
          // Nested try/finally: Word must be shut down even when closing the document fails,
          // otherwise the Word process started above is left running.
          try
          {
              if (doc != null)
              {
                  // Nothing was modified, so never prompt for or perform a save.
                  doc.Close(SaveChanges: WdSaveOptions.wdDoNotSaveChanges);
#pragma warning disable CA1416 // validate platform compatibility
                  System.Runtime.InteropServices.Marshal.ReleaseComObject(doc);
#pragma warning restore CA1416 // validate platform compatibility
              }
          }
          finally
          {
              if (word != null)
              {
                  word.Quit();
#pragma warning disable CA1416 // validate platform compatibility
                  System.Runtime.InteropServices.Marshal.ReleaseComObject(word);
#pragma warning restore CA1416 // validate platform compatibility
              }
          }
      }
  }
  /// <summary>
  /// Reads a .docx file: loads the main document XML, extracts one text line per
  /// paragraph, and hands those lines to <c>List2String</c>.
  /// </summary>
  /// <param name="filepath">Path of the .docx file.</param>
  /// <returns>The value returned by <c>List2String</c> for the extracted paragraph lines.</returns>
  private string GetDocxTxt(string filepath)
  {
      string mainXml = getDocxMainXml(filepath);
      List<string> paragraphLines = ExtractWPTextFromXml(mainXml);
      return List2String(paragraphLines);
  }
  /// <summary>
  /// Joins the non-empty lines into one CRLF-separated string and, as a side effect,
  /// distributes the lines into <see cref="Claim"/>, <see cref="Abstract"/> and
  /// <see cref="FullText"/> according to the most recent section heading seen.
  /// </summary>
  /// <remarks>
  /// A line is a heading when, after removing U+0001 characters and spaces and
  /// trimming, it equals one of the known section titles. Heading lines are part of
  /// the returned text but are not added to any section. Lines that follow
  /// "摘要附图" or "说明书附图" are not added to any section either.
  /// </remarks>
  /// <param name="lines">Document lines in reading order.</param>
  /// <returns>Every non-empty line, trimmed and terminated with "\r\n".</returns>
  private string List2String(List<string> lines)
  {
      string[] headings = { "权利要求书", "说明书摘要", "说明书", "背景技术", "发明内容", "技术领域", "具体实施方式", "摘要附图", "说明书附图" };

      StringBuilder allText = new StringBuilder();

      // Builders replace repeated string concatenation, which copied the whole
      // accumulated section for every line. They are seeded with the current
      // property values so existing content is extended exactly as before.
      StringBuilder claimText = new StringBuilder(this.Claim);
      StringBuilder abstractText = new StringBuilder(this.Abstract);
      StringBuilder fullText = new StringBuilder(this.FullText);

      string lastBlock = string.Empty;
      foreach (var line in lines)
      {
          if (string.IsNullOrEmpty(line))
          {
              continue;
          }

          allText.Append(line.Trim()).Append("\r\n");

          // Normalize once per line; used for both the heading test and the new section name.
          string normalized = line.Replace("\u0001", "").Replace(" ", "").Trim();
          if (Array.IndexOf(headings, normalized) >= 0)
          {
              lastBlock = normalized;
              continue;
          }

          switch (lastBlock)
          {
              case "权利要求书":
                  claimText.Append("\r\n").Append(line);
                  break;
              case "说明书摘要":
                  abstractText.Append("\r\n").Append(line);
                  break;
              case "说明书":
              case "背景技术":
              case "发明内容":
              case "技术领域":
              case "具体实施方式":
                  fullText.Append("\r\n").Append(line);
                  break;
          }
      }

      // Trim() on an empty builder yields string.Empty, matching the previous null/empty handling.
      this.Abstract = abstractText.ToString().Trim();
      this.Claim = claimText.ToString().Trim();
      this.FullText = fullText.ToString().Trim();
      return allText.ToString();
  }
  /// <summary>
  /// Opens the .docx package and returns the raw XML of its main document part.
  /// </summary>
  /// <param name="filePath">Path of the .docx file.</param>
  /// <returns>
  /// The content of the first part whose content type starts with the
  /// WordprocessingML main-document type, or an empty string when the package has
  /// no such part.
  /// </returns>
  private string getDocxMainXml(string filePath)
  {
      // Open for reading only: the two-argument overload requests read/write access,
      // which fails on read-only files and locks the document against other readers.
      using (Package package = Package.Open(filePath, FileMode.Open, System.IO.FileAccess.Read))
      {
          foreach (PackagePart part in package.GetParts())
          {
              // Content types are identifiers, so compare ordinally rather than by culture.
              if (!part.ContentType.StartsWith("application/vnd.openxmlformats-officedocument.wordprocessingml.document.main", StringComparison.Ordinal))
              {
                  continue;
              }

              using (Stream stream = part.GetStream(FileMode.Open, System.IO.FileAccess.Read))
              using (StreamReader reader = new StreamReader(stream))
              {
                  return reader.ReadToEnd();
              }
          }
      }

      return string.Empty;
  }
  // Matches one <w:p> paragraph element (with or without attributes) and captures
  // its inner markup in group 2. Singleline lets '.' cross line breaks so that a
  // paragraph whose markup spans several lines is still found.
  private static readonly Regex WpElementRegex = new Regex(
      "(<w:p\\s.*?>|<w:p>)(.*?)</w:p>",
      RegexOptions.Singleline | RegexOptions.Compiled);

  /// <summary>
  /// Splits WordprocessingML markup into paragraphs and returns the text of each one.
  /// </summary>
  /// <param name="xmlText">Raw XML of the main document part.</param>
  /// <returns>
  /// One entry per matched &lt;w:p&gt; element, in document order; each entry is the
  /// result of <c>ExtractWtTextFromXml</c> for that paragraph's inner markup.
  /// </returns>
  private List<string> ExtractWPTextFromXml(string xmlText)
  {
      List<string> lines = new List<string>();
      foreach (Match paragraph in WpElementRegex.Matches(xmlText))
      {
          lines.Add(ExtractWtTextFromXml(paragraph.Groups[2].Value));
      }
      return lines;
  }
  // Matches one <w:t> text element (with or without attributes) and captures its
  // content in group 1. The (?<!/) guard rejects self-closing tags such as
  // <w:t xml:space="preserve"/>, which would otherwise be treated as an opening tag
  // and pull the following markup into the captured text. Singleline lets the
  // content contain line breaks.
  private static readonly Regex WtElementRegex = new Regex(
      "<w:t(?:\\s[^>]*)?(?<!/)>(.*?)</w:t>",
      RegexOptions.Singleline | RegexOptions.Compiled);

  /// <summary>
  /// Concatenates the content of every &lt;w:t&gt; element found in the given markup.
  /// </summary>
  /// <param name="xmlText">WordprocessingML markup, typically the inside of one paragraph.</param>
  /// <returns>
  /// The element contents joined without separators, exactly as they appear in the
  /// markup; an empty string when there is no &lt;w:t&gt; element.
  /// </returns>
  private string ExtractWtTextFromXml(string xmlText)
  {
      // NOTE(review): the captured text is not XML-unescaped here (e.g. "&amp;" stays
      // as written); confirm whether downstream consumers rely on that.
      StringBuilder text = new StringBuilder();
      foreach (Match run in WtElementRegex.Matches(xmlText))
      {
          text.Append(run.Groups[1].Value);
      }
      return text.ToString();
  }
  173. }
  /// <summary>
  /// The original document, parsed by <c>Compare</c> from its <c>oldFile</c> argument.
  /// </summary>
  public PatentDocument oldDocument { get; set; }

  /// <summary>
  /// The revised document, parsed by <c>Compare</c> from its <c>newFile</c> argument.
  /// </summary>
  public PatentDocument newDocument { get; set; }

  /// <summary>
  /// Comparison result for the claims sections of the two documents.
  /// </summary>
  public CompareResult ClaimResult { get; set; }

  /// <summary>
  /// Comparison result for the abstract sections of the two documents.
  /// </summary>
  public CompareResult AbstractResult { get; set; }

  /// <summary>
  /// Comparison result for the description sections of the two documents.
  /// </summary>
  public CompareResult FulltextResult { get; set; }

  /// <summary>
  /// Comparison result for the complete text of the two documents.
  /// </summary>
  public CompareResult AllStringResult { get; set; }
  /// <summary>
  /// Parses both files and stores the comparison of their claims, abstract,
  /// description and complete text in the corresponding result properties.
  /// </summary>
  /// <param name="oldFile">Path of the original document.</param>
  /// <param name="newFile">Path of the revised document.</param>
  public void Compare(string oldFile, string newFile)
  {
      // Each document is stored as soon as it is parsed, before the next one is loaded.
      this.oldDocument = new PatentDocument(oldFile);
      this.newDocument = new PatentDocument(newFile);

      PatentDocument before = this.oldDocument;
      PatentDocument after = this.newDocument;

      this.ClaimResult = StringCompare(before.Claim, after.Claim);
      this.AbstractResult = StringCompare(before.Abstract, after.Abstract);
      this.FulltextResult = StringCompare(before.FullText, after.FullText);
      this.AllStringResult = StringCompare(before.DocumentString, after.DocumentString);
  }
  /// <summary>
  /// Compares two texts character by character and returns the edit statistics
  /// together with an HTML rendering of the differences.
  /// </summary>
  /// <remarks>
  /// Deleted text is wrapped in a red &lt;strike&gt; element and inserted text in a
  /// blue &lt;u&gt; element; "\r\n" in the rendering is replaced with "&lt;br/&gt;".
  /// A modification whose rendered markup is identical to one already seen is shown
  /// in the rendering but is not counted again in the delete, insert and edit counts.
  /// </remarks>
  /// <param name="oldtext">Original text; null is treated as an empty string.</param>
  /// <param name="newtext">Revised text; null is treated as an empty string.</param>
  /// <returns>A result holding both text lengths, the counts, the rendering and the text similarity.</returns>
  public CompareResult StringCompare(string oldtext, string newtext)
  {
      if (oldtext == null) { oldtext = ""; }
      if (newtext == null) { newtext = ""; }

      CompareResult result = new CompareResult();
      result.oldWordCount = oldtext.Length;
      result.newWordCount = newtext.Length;

      // NOTE(review): the third argument is presumably DiffPlex's ignore-whitespace flag — confirm.
      var differ = new Differ();
      var diff = differ.CreateCharacterDiffs(oldtext, newtext, true);

      // NOTE(review): the text is embedded in the markup without HTML encoding; confirm
      // whether callers guarantee that the input is already safe to render.
      StringBuilder rendering = new StringBuilder();
      HashSet<string> seenModifications = new HashSet<string>();
      int lastPos = 0;

      foreach (var change in diff.DiffBlocks)
      {
          // Unchanged text between the previous block and this one.
          rendering.Append(oldtext, lastPos, change.DeleteStartA - lastPos);
          lastPos = change.DeleteStartA + change.DeleteCountA;

          StringBuilder modification = new StringBuilder();
          if (change.DeleteCountA > 0)
          {
              modification.Append("<strike style=\"text-decoration: line-through; color: red;\">")
                          .Append(oldtext, change.DeleteStartA, change.DeleteCountA)
                          .Append("</strike>");
          }
          if (change.InsertCountB > 0)
          {
              modification.Append("<u style=\"text-decoration: underline; color: blue;\">")
                          .Append(newtext, change.InsertStartB, change.InsertCountB)
                          .Append("</u>");
          }

          string modificationMarkup = modification.ToString();
          rendering.Append(modificationMarkup);

          // Add returns false for markup that was already recorded, so repeated
          // identical modifications contribute to the counts only once.
          if (seenModifications.Add(modificationMarkup))
          {
              result.DeleteCount += change.DeleteCountA;
              result.InsertCount += change.InsertCountB;
              result.EditCount += 1;
          }
      }

      // Unchanged tail after the last block.
      rendering.Append(oldtext, lastPos, oldtext.Length - lastPos);
      rendering.Replace("\r\n", "<br/>");

      result.CompareResultString = rendering.ToString();
      result.TextSimilarity = CosineSimilarity.Calculate(oldtext, newtext);
      return result;
  }
  262. }
  263. }