CompareDocx.cs 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360
  1. using System;
  2. using System.Collections.Generic;
  3. using System.IO;
  4. using System.Text;
  5. using DiffPlex;
  6. using System.IO.Packaging;
  7. using System.Text.RegularExpressions;
  8. using System.Linq;
  9. using Microsoft.Office.Interop.Word;
  10. namespace wispro.sp.utility
  11. {
  12. /// <summary>
  13. /// 比较两个Docx文档中文字的不同
  14. /// </summary>
  15. public class CompareDocx
  16. {
  17. public class PatentDocument
  18. {
  /// <summary>
  /// Path of the source file, as passed to the constructor.
  /// </summary>
  public string FilePath { get; set; }

  /// <summary>
  /// Text collected after a "说 明 书 摘 要" heading line; trimmed, and empty when
  /// nothing was collected.
  /// </summary>
  public string Abstract { get; set; }

  /// <summary>
  /// Text collected after a "权 利 要 求 书" heading line; trimmed, and empty when
  /// nothing was collected.
  /// </summary>
  public string Claim { get; set; }

  /// <summary>
  /// Text collected after a "说 明 书" heading line; trimmed, and empty when
  /// nothing was collected.
  /// </summary>
  public string FullText { get; set; }

  /// <summary>
  /// Every non-empty line of the document, trimmed and terminated with "\r\n".
  /// Assigned by the constructor.
  /// </summary>
  public string DocumentString { get; set; }
  /// <summary>
  /// Loads the document at <paramref name="filePath"/> and extracts its text into
  /// <see cref="DocumentString"/>.
  /// </summary>
  /// <remarks>
  /// A file whose extension is ".doc" (compared case-insensitively) is read through
  /// Word automation; every other file is read as an OOXML (.docx) package.
  /// </remarks>
  /// <param name="filePath">Path of the Word document to read.</param>
  /// <exception cref="ApplicationException">The file does not exist.</exception>
  /// <exception cref="Exception">Reading a .doc file through Word failed.</exception>
  public PatentDocument(string filePath)
  {
      this.FilePath = filePath;
      if (!System.IO.File.Exists(this.FilePath))
      {
          throw new ApplicationException("指定的文件不存在!");
      }

      // Compare the actual extension ordinally and without regard to case, so that
      // "X.DOC" is routed to the legacy reader and the result does not depend on the
      // current culture (a plain EndsWith(".doc") is culture- and case-sensitive).
      string extension = System.IO.Path.GetExtension(this.FilePath);
      bool isLegacyDoc = string.Equals(extension, ".doc", StringComparison.OrdinalIgnoreCase);

      DocumentString = isLegacyDoc
          ? GetDocTxt(this.FilePath)
          : GetDocxTxt(this.FilePath);
  }
  /// <summary>
  /// Reads a legacy .doc file through Word automation and returns its text.
  /// </summary>
  /// <remarks>
  /// The document text is split on carriage returns and handed to
  /// <see cref="List2String"/>, which also fills the section properties.
  /// </remarks>
  /// <param name="filePath">Path of the .doc file.</param>
  /// <returns>The string produced by <see cref="List2String"/>.</returns>
  /// <exception cref="Exception">
  /// Opening or reading the document failed; the original failure is available as
  /// <see cref="Exception.InnerException"/>.
  /// </exception>
  private string GetDocTxt(string filePath)
  {
      Application word = null;
      Document doc = null;
      try
      {
          // Start a Word automation instance.
          word = new Application();

          // Resolve to a full path before handing it to Word, and open read-only:
          // the document is only read here.
          System.IO.FileInfo fileInfo = new System.IO.FileInfo(filePath);
          doc = word.Documents.Open(fileInfo.FullName, ReadOnly: true);

          // One list entry per carriage-return-separated chunk of the document text.
          string content = doc.Content.Text;
          List<string> lines = content.Split("\r").ToList();
          return List2String(lines);
      }
      catch (Exception ex)
      {
          // Keep the original exception as InnerException so its type and stack
          // trace are not lost.
          throw new Exception($"读取Word文档时发生错误: {ex.Message}", ex);
      }
      finally
      {
          try
          {
              // Close the document without asking to save.
              if (doc != null)
              {
                  doc.Close(SaveChanges: false);
  #pragma warning disable CA1416 // Validate platform compatibility
                  System.Runtime.InteropServices.Marshal.ReleaseComObject(doc);
  #pragma warning restore CA1416 // Validate platform compatibility
              }
          }
          finally
          {
              // Always quit Word, even when closing the document failed; otherwise
              // the Word process would be left running.
              if (word != null)
              {
                  word.Quit();
  #pragma warning disable CA1416 // Validate platform compatibility
                  System.Runtime.InteropServices.Marshal.ReleaseComObject(word);
  #pragma warning restore CA1416 // Validate platform compatibility
              }
          }
      }
  }
  /// <summary>
  /// Extracts the text of a .docx file: reads the main document XML, pulls the text
  /// of each paragraph out of it, and joins the paragraphs with
  /// <see cref="List2String"/>.
  /// </summary>
  /// <param name="filepath">Path of the .docx file.</param>
  /// <returns>The string produced by <see cref="List2String"/>.</returns>
  private string GetDocxTxt(string filepath)
      => List2String(ExtractWPTextFromXml(getDocxMainXml(filepath)));
  /// <summary>
  /// Joins the non-empty lines of a document into one string and, as a side effect,
  /// sorts the lines into <see cref="Claim"/>, <see cref="Abstract"/> and
  /// <see cref="FullText"/> according to the most recent section heading.
  /// </summary>
  /// <remarks>
  /// A line is a heading when its trimmed text equals one of the known section
  /// titles. Lines following the two drawing headings are not collected into any
  /// section. Section text is accumulated in <see cref="StringBuilder"/>s rather
  /// than by repeated string concatenation, which was quadratic in document size.
  /// </remarks>
  /// <param name="lines">Paragraph texts in document order.</param>
  /// <returns>Every non-empty line, trimmed and terminated with "\r\n".</returns>
  private string List2String(List<string> lines)
  {
      string[] headings = { "权 利 要 求 书", "说 明 书 摘 要", "说 明 书" , "摘 要 附 图", "说 明 书 附 图" };

      var document = new StringBuilder();
      // Seed with any existing value so the result matches appending to the property.
      var claim = new StringBuilder(this.Claim);
      var abstractText = new StringBuilder(this.Abstract);
      var fullText = new StringBuilder(this.FullText);
      string currentHeading = string.Empty;

      foreach (var line in lines)
      {
          if (string.IsNullOrEmpty(line))
          {
              continue;
          }

          string trimmed = line.Trim();
          document.Append(trimmed).Append("\r\n");

          if (Array.IndexOf(headings, trimmed) >= 0)
          {
              currentHeading = trimmed;
              continue;
          }

          // Section bodies keep the line as given (untrimmed); only the finished
          // section text is trimmed below.
          switch (currentHeading)
          {
              case "权 利 要 求 书":
                  claim.Append("\r\n").Append(line);
                  break;
              case "说 明 书 摘 要":
                  abstractText.Append("\r\n").Append(line);
                  break;
              case "说 明 书":
                  fullText.Append("\r\n").Append(line);
                  break;
          }
      }

      this.Abstract = abstractText.ToString().Trim();
      this.Claim = claim.ToString().Trim();
      this.FullText = fullText.ToString().Trim();
      return document.ToString();
  }
  /// <summary>
  /// Returns the XML of the main document part of an OOXML (.docx) package.
  /// </summary>
  /// <remarks>
  /// The package is opened for reading only, so read-only files can be processed;
  /// the two-argument <c>Package.Open</c> overload requests read/write access and
  /// no sharing.
  /// </remarks>
  /// <param name="filePath">Path of the .docx file.</param>
  /// <returns>
  /// The content of the first part whose content type starts with the
  /// WordprocessingML main-document type, or an empty string when there is none.
  /// </returns>
  private string getDocxMainXml(string filePath)
  {
      const string mainPartContentType = "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main";

      using (Package package = Package.Open(filePath, FileMode.Open, System.IO.FileAccess.Read, System.IO.FileShare.Read))
      {
          foreach (var part in package.GetParts())
          {
              // Content types are identifiers, not linguistic text: compare ordinally.
              if (!part.ContentType.StartsWith(mainPartContentType, StringComparison.Ordinal))
              {
                  continue;
              }

              using (Stream stream = part.GetStream(FileMode.Open, System.IO.FileAccess.Read))
              using (StreamReader reader = new StreamReader(stream))
              {
                  return reader.ReadToEnd();
              }
          }
      }

      return string.Empty;
  }
  /// <summary>
  /// Splits WordprocessingML into paragraphs and returns the text of each one.
  /// </summary>
  /// <remarks>
  /// Paragraphs are located with a regular expression over w:p elements. The
  /// Singleline option lets a paragraph span line breaks in the XML; without it,
  /// any paragraph whose markup is spread over several lines is silently skipped.
  /// </remarks>
  /// <param name="xmlText">XML of the main document part.</param>
  /// <returns>One entry per matched paragraph, in document order.</returns>
  private List<string> ExtractWPTextFromXml(string xmlText)
  {
      List<string> lines = new List<string>();

      // Group 2 is the inner XML of each w:p element (with or without attributes).
      MatchCollection paragraphs = Regex.Matches(
          xmlText,
          "(<w:p\\s.*?>|<w:p>)(.*?)</w:p>",
          RegexOptions.Singleline);

      foreach (Match paragraph in paragraphs)
      {
          lines.Add(ExtractWtTextFromXml(paragraph.Groups[2].Value));
      }

      return lines;
  }
  /// <summary>
  /// Concatenates the content of every w:t element found in the given XML fragment.
  /// </summary>
  /// <param name="xmlText">XML fragment, typically the inside of one paragraph.</param>
  /// <returns>The text-run contents joined in order, with no separator.</returns>
  private string ExtractWtTextFromXml(string xmlText)
  {
      var text = new StringBuilder();

      // Walk the matches one at a time; group 2 is the content between the tags.
      for (Match run = Regex.Match(xmlText, "(<w:t\\s.*?>|<w:t>)(.*?)</w:t>");
           run.Success;
           run = run.NextMatch())
      {
          text.Append(run.Groups[2].Value);
      }

      return text.ToString();
  }
  165. }
  /// <summary>
  /// Outcome of comparing an original text with its revised version.
  /// </summary>
  public class StringCompareResult
  {
      /// <summary>
      /// Length of the original text.
      /// </summary>
      public int oldWordCount { get; set; }

      /// <summary>
      /// Length of the revised text.
      /// </summary>
      public int newWordCount { get; set; }

      /// <summary>
      /// Amount of text deleted from the original by the revision.
      /// </summary>
      public int DeleteCount { get; set; }

      /// <summary>
      /// Amount of text inserted by the revision.
      /// </summary>
      public int InsertCount { get; set; }

      /// <summary>
      /// Number of revision sites.
      /// </summary>
      public double EditCount { get; set; }

      /// <summary>
      /// The document text with the revisions marked up.
      /// </summary>
      public string CompareResultString { get; set; }

      /// <summary>
      /// Overall modification rate: (deleted + inserted) / original length.
      /// </summary>
      /// <remarks>
      /// When the original length is zero the quotient is undefined (it used to be
      /// NaN or Infinity); in that case the rate is 0 if nothing was changed and 1
      /// if anything was.
      /// </remarks>
      public double diffRate
      {
          get
          {
              double changed = (double)DeleteCount + (double)InsertCount;
              if (oldWordCount == 0)
              {
                  return changed > 0 ? 1.0 : 0.0;
              }

              return changed / (double)oldWordCount;
          }
      }

      /// <summary>
      /// Similarity score of the two texts; stored as assigned by whoever fills in
      /// this result.
      /// </summary>
      public double TextSimilarity { get; set; }
  }
  /// <summary>
  /// The original document, as loaded by <see cref="Compare"/>.
  /// </summary>
  public PatentDocument oldDocument { get; set; }

  /// <summary>
  /// The revised document, as loaded by <see cref="Compare"/>.
  /// </summary>
  public PatentDocument newDocument { get; set; }

  /// <summary>
  /// Comparison result for the claims of the two documents.
  /// </summary>
  public StringCompareResult ClaimResult { get; set; }

  /// <summary>
  /// Comparison result for the abstracts of the two documents.
  /// </summary>
  public StringCompareResult AbstractResult { get; set; }

  /// <summary>
  /// Comparison result for the descriptions (full text) of the two documents.
  /// </summary>
  public StringCompareResult FulltextResult { get; set; }

  /// <summary>
  /// Comparison result for the complete text of the two documents.
  /// </summary>
  public StringCompareResult AllStringResult { get; set; }
  /// <summary>
  /// Loads both documents and compares them section by section.
  /// </summary>
  /// <remarks>
  /// The loaded documents are stored in <see cref="oldDocument"/> and
  /// <see cref="newDocument"/>; the comparison results are stored in
  /// <see cref="ClaimResult"/>, <see cref="AbstractResult"/>,
  /// <see cref="FulltextResult"/> and <see cref="AllStringResult"/>.
  /// </remarks>
  /// <param name="oldFile">Path of the original document.</param>
  /// <param name="newFile">Path of the revised document.</param>
  public void Compare(string oldFile, string newFile)
  {
      // Load and publish each document in turn, original first.
      PatentDocument before = oldDocument = new PatentDocument(oldFile);
      PatentDocument after = newDocument = new PatentDocument(newFile);

      ClaimResult = StringCompare(before.Claim, after.Claim);
      AbstractResult = StringCompare(before.Abstract, after.Abstract);
      FulltextResult = StringCompare(before.FullText, after.FullText);
      AllStringResult = StringCompare(before.DocumentString, after.DocumentString);
  }
  /// <summary>
  /// Compares two texts character by character and reports what changed.
  /// </summary>
  /// <remarks>
  /// The returned <c>CompareResultString</c> is the original text with deletions
  /// wrapped in a red strike-through element and insertions in a blue underlined
  /// element; "\r\n" is replaced by a line-break tag. An edit whose markup is
  /// identical to an earlier edit is shown but not counted again in
  /// <c>DeleteCount</c>, <c>InsertCount</c> or <c>EditCount</c>.
  /// NOTE(review): the text is placed into the markup as-is, without HTML encoding;
  /// confirm that callers only render text that is safe to embed.
  /// </remarks>
  /// <param name="oldtext">Original text; null is treated as empty.</param>
  /// <param name="newtext">Revised text; null is treated as empty.</param>
  /// <returns>Counts, markup and similarity for the two texts.</returns>
  public StringCompareResult StringCompare(string oldtext,string newtext)
  {
      oldtext = oldtext ?? "";
      newtext = newtext ?? "";

      StringCompareResult result = new StringCompareResult
      {
          oldWordCount = oldtext.Length,
          newWordCount = newtext.Length
      };

      var differ = new Differ();
      var diff = differ.CreateCharacterDiffs(oldtext, newtext, true);

      // Build the markup in a StringBuilder; repeated string concatenation was
      // quadratic in the length of the text.
      var markup = new StringBuilder();
      // Markup of the edits already counted, for O(1) duplicate detection.
      var countedEdits = new HashSet<string>();
      int lastPos = 0;

      foreach (var change in diff.DiffBlocks)
      {
          // Unchanged text between the previous edit and this one.
          markup.Append(oldtext, lastPos, change.DeleteStartA - lastPos);
          lastPos = change.DeleteStartA + change.DeleteCountA;

          var edit = new StringBuilder();
          if (change.DeleteCountA > 0)
          {
              edit.Append($"<strike style=\"text-decoration: line-through; color: red;\">{oldtext.Substring(change.DeleteStartA, change.DeleteCountA)}</strike>");
          }
          if (change.InsertCountB > 0)
          {
              edit.Append($"<u style=\"text-decoration: underline; color: blue;\">{newtext.Substring(change.InsertStartB, change.InsertCountB)}</u>");
          }

          string editMarkup = edit.ToString();
          markup.Append(editMarkup);

          // Count each distinct edit only once.
          if (countedEdits.Add(editMarkup))
          {
              result.DeleteCount += change.DeleteCountA;
              result.InsertCount += change.InsertCountB;
              result.EditCount += 1;
          }
      }

      // Unchanged tail after the last edit.
      markup.Append(oldtext, lastPos, oldtext.Length - lastPos);

      result.CompareResultString = markup.ToString().Replace("\r\n", "<br/>");
      result.TextSimilarity = CosineSimilarity.Calculate(oldtext, newtext);
      return result;
  }
  292. }
  293. }