// CompareDocx.cs
using System;
using System.Collections.Generic;
using System.IO;
using System.IO.Packaging;
using System.Linq;
using System.Net;
using System.Runtime.InteropServices;
using System.Text;
using System.Text.RegularExpressions;
using DiffPlex;
using Microsoft.Office.Interop.Word;
using wispro.sp.entity.CompareCase;
  12. namespace wispro.sp.utility
  13. {
  14. /// <summary>
  15. /// 比较两个Docx文档中文字的不同
  16. /// </summary>
  17. public class CompareDocx
  18. {
  19. public class PatentDocument
  20. {
  21. public string FilePath { get; set; }
  22. public string Abstract { get; set; }
  23. public string Claim { get; set; }
  24. public string FullText { get; set; }
  25. public string DocumentString { get; set; }
  26. public PatentDocument(string filePath)
  27. {
  28. this.FilePath = filePath;
  29. if (!System.IO.File.Exists(this.FilePath))
  30. {
  31. throw new ApplicationException("指定的文件不存在!");
  32. }
  33. if (this.FilePath.EndsWith(".doc"))
  34. {
  35. DocumentString = GetDocTxt(this.FilePath);
  36. }
  37. else
  38. {
  39. if (this.FilePath.ToLower().EndsWith(".docx"))
  40. {
  41. DocumentString = GetDocxTxt(this.FilePath);
  42. }
  43. }
  44. }
  45. private string GetDocTxt(string filePath)
  46. {
  47. Application word = null;
  48. Document doc = null;
  49. string content = string.Empty;
  50. try
  51. {
  52. // 创建Word应用实例
  53. word = new Application();
  54. // 打开Word文档
  55. System.IO.FileInfo fileInfo = new System.IO.FileInfo(filePath);
  56. if (fileInfo.Name.Contains("-保密-") || fileInfo.Name.Contains("加密版"))
  57. {
  58. return null;
  59. }
  60. else
  61. {
  62. try
  63. {
  64. doc = word.Documents.Open(
  65. fileInfo.FullName,
  66. ReadOnly: true,
  67. PasswordDocument: Type.Missing,
  68. WritePasswordDocument: "");
  69. // 读取文档内容
  70. content = doc.Content.Text;
  71. List<string> lines = content.Split(new string[] { "\f", "\r" }, StringSplitOptions.None).ToList();
  72. return List2String(lines);
  73. }
  74. catch (COMException ex) when (ex.ErrorCode == -2146823167 || ex.ErrorCode == -2147024891) // 0x80070005
  75. {
  76. return null; // 访问被拒绝 → 文档加密
  77. }
  78. catch
  79. {
  80. throw; // 其他异常(如文件损坏)需单独处理
  81. }
  82. }
  83. }
  84. catch (Exception ex)
  85. {
  86. throw new Exception($"读取Word文档时发生错误: {ex.Message}");
  87. }
  88. finally
  89. {
  90. // 关闭文档
  91. if (doc != null)
  92. {
  93. doc.Close();
  94. #pragma warning disable CA1416 // 验证平台兼容性
  95. System.Runtime.InteropServices.Marshal.ReleaseComObject(doc);
  96. #pragma warning restore CA1416 // 验证平台兼容性
  97. }
  98. // 退出Word应用
  99. if (word != null)
  100. {
  101. word.Quit();
  102. #pragma warning disable CA1416 // 验证平台兼容性
  103. System.Runtime.InteropServices.Marshal.ReleaseComObject(word);
  104. #pragma warning restore CA1416 // 验证平台兼容性
  105. }
  106. }
  107. }
  108. private string GetDocxTxt(string filepath)
  109. {
  110. var oldtext = getDocxMainXml(filepath);
  111. var oldlines = ExtractWPTextFromXml(oldtext);
  112. oldtext = List2String(oldlines);
  113. return oldtext;
  114. }
  115. private string List2String(List<string> lines)
  116. {
  117. string[] array = { "权利要求书", "说明书摘要", "说明书", "背景技术", "发明内容", "技术领域", "具体实施方式", "摘要附图", "说明书附图" };
  118. StringBuilder sb = new StringBuilder();
  119. string lastBlock = string.Empty;
  120. foreach (var line in lines)
  121. {
  122. if (!string.IsNullOrEmpty(line))
  123. {
  124. sb.Append(line.Trim() + "\r\n");
  125. if (Array.Exists(array, element => element == line.Replace("\u0001", "").Replace(" ", "").Trim()))
  126. {
  127. lastBlock = line.Replace("\u0001", "").Replace(" ", "").Trim();
  128. }
  129. else
  130. {
  131. switch (lastBlock)
  132. {
  133. case "权利要求书":
  134. this.Claim = this.Claim + "\r\n" + line;
  135. break;
  136. case "说明书摘要":
  137. this.Abstract = this.Abstract + "\r\n" + line;
  138. break;
  139. case "说明书":
  140. case "背景技术":
  141. case "发明内容":
  142. case "技术领域":
  143. case "具体实施方式":
  144. this.FullText = this.FullText + "\r\n" + line;
  145. break;
  146. }
  147. }
  148. }
  149. }
  150. this.Abstract = string.IsNullOrEmpty(this.Abstract) ? string.Empty : this.Abstract.Trim();
  151. this.Claim = string.IsNullOrEmpty(this.Claim) ? string.Empty : this.Claim.Trim();
  152. this.FullText = string.IsNullOrEmpty(this.FullText) ? string.Empty : this.FullText.Trim();
  153. return sb.ToString();
  154. }
  155. private string getDocxMainXml(string filePath)
  156. {
  157. string text = string.Empty;
  158. using (Package package = Package.Open(filePath, FileMode.Open))
  159. {
  160. var Parts = package.GetParts();
  161. foreach (var part in Parts)
  162. {
  163. if (part.ContentType.StartsWith("application/vnd.openxmlformats-officedocument.wordprocessingml.document.main"))
  164. {
  165. using (Stream stream = part.GetStream())
  166. {
  167. StreamReader reader = new StreamReader(stream);
  168. text = reader.ReadToEnd();
  169. break;
  170. }
  171. }
  172. }
  173. return text;
  174. }
  175. }
  176. private List<string> ExtractWPTextFromXml(string xmlText)
  177. {
  178. List<string> lines = new List<string>();
  179. // 使用正则表达式匹配 <w:t> 标签的内容
  180. MatchCollection matches = Regex.Matches(xmlText, "(<w:p\\s.*?>|<w:p>)(.*?)</w:p>");
  181. foreach (Match match in matches)
  182. {
  183. lines.Add(ExtractWtTextFromXml(match.Groups[2].Value));
  184. }
  185. return lines;
  186. }
  187. private string ExtractWtTextFromXml(string xmlText)
  188. {
  189. // 使用正则表达式匹配 <w:t> 标签的内容
  190. MatchCollection matches = Regex.Matches(xmlText, "(<w:t\\s.*?>|<w:t>)(.*?)</w:t>");
  191. StringBuilder sb = new StringBuilder();
  192. foreach (Match match in matches)
  193. {
  194. sb.Append(match.Groups[2].Value);
  195. }
  196. return sb.ToString();
  197. }
  198. }
  199. /// <summary>
  200. /// 原文档路径
  201. /// </summary>
  202. public PatentDocument oldDocument { get; set; }
  203. /// <summary>
  204. /// 修订后文档路径
  205. /// </summary>
  206. public PatentDocument newDocument { get; set; }
  207. /// <summary>
  208. /// 权力要求比较结果
  209. /// </summary>
  210. public CompareResult ClaimResult { get; set; }
  211. /// <summary>
  212. /// 摘要比较结果
  213. /// </summary>
  214. public CompareResult AbstractResult { get; set; }
  215. /// <summary>
  216. /// 说明书比较结果
  217. /// </summary>
  218. public CompareResult FulltextResult { get; set; }
  219. /// <summary>
  220. /// 所有文字比较结果
  221. /// </summary>
  222. public CompareResult AllStringResult { get; set; }
  223. /// <summary>
  224. /// 比较两个文档
  225. /// </summary>
  226. /// <param name="oldFile"></param>
  227. /// <param name="newFile"></param>
  228. public void Compare(string oldFile, string newFile)
  229. {
  230. this.oldDocument = new PatentDocument(oldFile);
  231. this.newDocument = new PatentDocument(newFile);
  232. if (!string.IsNullOrEmpty(this.oldDocument.Claim) && !string.IsNullOrEmpty(this.newDocument.Claim))
  233. {
  234. this.ClaimResult = StringCompare(this.oldDocument.Claim, this.newDocument.Claim);
  235. }
  236. if (!string.IsNullOrEmpty(this.oldDocument.Abstract) && !string.IsNullOrEmpty(this.newDocument.Abstract))
  237. this.AbstractResult = StringCompare(this.oldDocument.Abstract, this.newDocument.Abstract);
  238. if (!string.IsNullOrEmpty(this.oldDocument.FullText) && !string.IsNullOrEmpty(this.newDocument.FullText))
  239. this.FulltextResult = StringCompare(this.oldDocument.FullText, this.newDocument.FullText);
  240. if (!string.IsNullOrEmpty(this.oldDocument.DocumentString) && !string.IsNullOrEmpty(this.newDocument.DocumentString))
  241. this.AllStringResult = StringCompare(this.oldDocument.DocumentString, this.newDocument.DocumentString);
  242. }
  243. /// <summary>
  244. /// 比较两个文档
  245. /// </summary>
  246. /// <exception cref="ApplicationException"></exception>
  247. public CompareResult StringCompare(string oldtext, string newtext)
  248. {
  249. CompareResult result = new CompareResult();
  250. var differ = new Differ();
  251. if (oldtext == null) { oldtext = ""; }
  252. if (newtext == null) { newtext = ""; }
  253. result.oldWordCount = oldtext.Length;
  254. result.newWordCount = newtext.Length;
  255. var diff = differ.CreateCharacterDiffs(oldtext, newtext, true);
  256. //result.EditCount = diff.DiffBlocks.Count;
  257. int lastPos = 0;
  258. string _CompareResultString = "";
  259. string lastResult = "";
  260. List<string> ModifyList = new List<string>();
  261. foreach (var change in diff.DiffBlocks)
  262. {
  263. string strModifyStr = "";
  264. lastResult += oldtext.Substring(lastPos, change.DeleteStartA - lastPos);
  265. _CompareResultString += oldtext.Substring(lastPos, change.DeleteStartA - lastPos);
  266. lastPos = change.DeleteStartA + change.DeleteCountA;
  267. if (change.DeleteCountA > 0)
  268. {
  269. strModifyStr += $"<strike style=\"text-decoration: line-through; color: red;\">{oldtext.Substring(change.DeleteStartA, change.DeleteCountA)}</strike>";
  270. _CompareResultString += $"<strike style=\"text-decoration: line-through; color: red;\">{oldtext.Substring(change.DeleteStartA, change.DeleteCountA)}</strike>";
  271. }
  272. if (change.InsertCountB > 0)
  273. {
  274. strModifyStr += $"<u style=\"text-decoration: underline; color: blue;\">{newtext.Substring(change.InsertStartB, change.InsertCountB)}</u>";
  275. _CompareResultString += $"<u style=\"text-decoration: underline; color: blue;\">{newtext.Substring(change.InsertStartB, change.InsertCountB)}</u>";
  276. lastResult += newtext.Substring(change.InsertStartB, change.InsertCountB);
  277. }
  278. if (!ModifyList.Contains(strModifyStr))
  279. {
  280. ModifyList.Add(strModifyStr);
  281. result.DeleteCount += change.DeleteCountA;
  282. result.InsertCount += change.InsertCountB;
  283. result.EditCount += 1;
  284. }
  285. }
  286. lastResult += oldtext.Substring(lastPos);
  287. _CompareResultString += oldtext.Substring(lastPos);
  288. _CompareResultString = _CompareResultString.Replace("\r\n", "<br/>");
  289. result.CompareResultString = _CompareResultString;
  290. result.TextSimilarity = CosineSimilarity.Calculate(oldtext, newtext);
  291. return result;
  292. }
  293. }
  294. }