CompareDocx.cs 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282
  1. using System;
  2. using System.Collections.Generic;
  3. using System.IO;
  4. using System.Text;
  5. using DiffPlex;
  6. using System.IO.Packaging;
  7. using System.Text.RegularExpressions;
  8. using NPOI.XWPF.UserModel;
  9. using System.Linq;
  10. using Microsoft.Office.Interop.Word;
  11. namespace wispro.sp.utility
  12. {
  13. /// <summary>
  14. /// 比较两个Docx文档中文字的不同
  15. /// </summary>
  16. public class CompareDocx
  17. {
  /// <summary>
  /// Path of the original document.
  /// </summary>
  public string oldDocument { get; set; }

  /// <summary>
  /// Path of the revised document.
  /// </summary>
  public string newDocument { get; set; }

  /// <summary>
  /// Overall change ratio: (deleted + inserted characters) divided by the
  /// character count of the original document. This is a floating-point
  /// division, so the value is NaN or infinite while the original count is 0.
  /// </summary>
  public double diffRate => ((double)_DeleteCount + _InsertCount) / _oldDocCount;

  private int _oldDocCount;

  /// <summary>
  /// Character count of the text extracted from the original document.
  /// </summary>
  public int oldDocumentCount => _oldDocCount;

  private int _newDocCount;

  /// <summary>
  /// Character count of the text extracted from the revised document.
  /// </summary>
  public int newDocumentCount => _newDocCount;

  private int _DeleteCount;

  /// <summary>
  /// Number of characters deleted in the revised document relative to the original.
  /// </summary>
  public int DeleteCount => _DeleteCount;

  private int _InsertCount;

  /// <summary>
  /// Number of characters inserted in the revised document relative to the original.
  /// </summary>
  public int InsertCount => _InsertCount;

  private double _EditCount;

  /// <summary>
  /// Number of revision spots (diff blocks) found by the comparison.
  /// </summary>
  public double EditCount => _EditCount;

  private string _CompareResultString;

  /// <summary>
  /// Marked-up text of the comparison result; null until a comparison has run.
  /// </summary>
  public string CompareResultString => _CompareResultString;
  /// <summary>
  /// Stores both paths on this instance and runs the comparison.
  /// </summary>
  /// <param name="oldFile">Path of the original document; assigned to <see cref="oldDocument"/>.</param>
  /// <param name="newFile">Path of the revised document; assigned to <see cref="newDocument"/>.</param>
  public void Compare(string oldFile, string newFile)
  {
      oldDocument = oldFile;
      newDocument = newFile;

      // Delegate to the parameterless overload, which reads the two properties.
      Compare();
  }
  /// <summary>
  /// Compares the text of <see cref="oldDocument"/> and <see cref="newDocument"/>
  /// character by character and publishes the counts and the marked-up result
  /// through the read-only properties of this instance.
  /// </summary>
  /// <remarks>
  /// Every call starts from zero, so an instance can be reused for several
  /// comparisons. Results are only published after the whole comparison succeeded.
  /// </remarks>
  /// <exception cref="ApplicationException">Either document path does not point to an existing file.</exception>
  public void Compare()
  {
      if (!System.IO.File.Exists(this.oldDocument) || !System.IO.File.Exists(this.newDocument))
      {
          throw new ApplicationException("指定的文件不存在!");
      }

      string oldtext = ReadDocumentText(this.oldDocument);
      string newtext = ReadDocumentText(this.newDocument);

      var differ = new Differ();
      var diff = differ.CreateCharacterDiffs(oldtext, newtext, true);

      // Accumulate into locals: the fields must not carry totals over from a previous call.
      int deleted = 0;
      int inserted = 0;
      int lastPos = 0;

      // NOTE(review): the extracted text is embedded into the markup as-is (no HTML encoding).
      var html = new StringBuilder("<p>");
      foreach (var change in diff.DiffBlocks)
      {
          deleted += change.DeleteCountA;
          inserted += change.InsertCountB;

          // Unchanged text between the previous block and this one.
          html.Append(oldtext, lastPos, change.DeleteStartA - lastPos);
          lastPos = change.DeleteStartA + change.DeleteCountA;

          if (change.DeleteCountA > 0)
          {
              html.Append("<strike style=\"text-decoration: line-through; color: red;\">")
                  .Append(oldtext, change.DeleteStartA, change.DeleteCountA)
                  .Append("</strike>");
          }
          if (change.InsertCountB > 0)
          {
              html.Append("<u style=\"text-decoration: underline; color: blue;\">")
                  .Append(newtext, change.InsertStartB, change.InsertCountB)
                  .Append("</u>");
          }
      }

      // Unchanged tail after the last diff block.
      html.Append(oldtext, lastPos, oldtext.Length - lastPos);

      _oldDocCount = oldtext.Length;
      _newDocCount = newtext.Length;
      _EditCount = diff.DiffBlocks.Count;
      _DeleteCount = deleted;
      _InsertCount = inserted;

      // Each extracted line ends with "\r\n"; turn those line breaks into paragraph boundaries.
      _CompareResultString = html.ToString().Replace("\r\n", "</p>\r\n<p>") + "</p>";
  }

  /// <summary>
  /// Reads the text of one document: files whose name ends with ".doc"
  /// (compared ordinally, ignoring case) are read via <c>GetDocTxt</c>,
  /// everything else via <c>GetDocxTxt</c>.
  /// </summary>
  private string ReadDocumentText(string path)
  {
      return path.EndsWith(".doc", StringComparison.OrdinalIgnoreCase)
          ? GetDocTxt(path)
          : GetDocxTxt(path);
  }
  /// <summary>
  /// Reads the text of a document by opening it in a Word application
  /// started through the Word interop API.
  /// </summary>
  /// <param name="filepath">Path of the document; resolved to a full path before opening.</param>
  /// <returns>The document text, split on carriage returns and re-joined by <c>List2String</c>.</returns>
  private string GetDocTxt(string filepath)
  {
      Application wordApp = new Application();
      try
      {
          string fullPath = new System.IO.FileInfo(filepath).FullName;
          Microsoft.Office.Interop.Word.Document doc = wordApp.Documents.Open(fullPath);
          try
          {
              List<string> lines = doc.Content.Text.Split("\r").ToList();
              return List2String(lines);
          }
          finally
          {
              // Close the document even when reading its text failed.
              doc.Close();
          }
      }
      finally
      {
          // Always shut the Word instance down; otherwise a failure above
          // would leave the started Word process running.
          wordApp.Quit();
      }
  }
  /// <summary>
  /// Reads the text of a .docx file: loads the main document XML, extracts
  /// the text of each paragraph and joins the paragraphs into one string.
  /// </summary>
  /// <param name="filepath">Path of the .docx file.</param>
  /// <returns>The extracted text as produced by <c>List2String</c>.</returns>
  private string GetDocxTxt(string filepath)
      => List2String(ExtractWPTextFromXml(getDocxMainXml(filepath)));
  /// <summary>
  /// Joins the given lines into a single string. Null and empty entries are
  /// skipped; every other entry is trimmed and terminated with "\r\n".
  /// </summary>
  /// <param name="lines">Lines to join.</param>
  /// <returns>The concatenated text; empty when no entry qualified.</returns>
  private string List2String(List<string> lines)
  {
      var buffer = new StringBuilder();
      for (int i = 0; i < lines.Count; i++)
      {
          string current = lines[i];
          if (string.IsNullOrEmpty(current))
          {
              continue;
          }

          // Whitespace-only entries pass the check above and become a bare line break.
          buffer.Append(current.Trim()).Append("\r\n");
      }
      return buffer.ToString();
  }
  /// <summary>
  /// Opens the file as an OPC package and returns the XML of the first part
  /// whose content type starts with the WordprocessingML main-document type.
  /// </summary>
  /// <param name="filePath">Path of the .docx package.</param>
  /// <returns>The XML text of the main document part, or an empty string when no such part exists.</returns>
  private string getDocxMainXml(string filePath)
  {
      const string mainPartContentType =
          "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main";

      // Read-only access is all that is needed; it also lets read-only files be
      // opened and allows other readers to share the file.
      using (Package package = Package.Open(filePath, FileMode.Open, FileAccess.Read, FileShare.Read))
      {
          foreach (var part in package.GetParts())
          {
              // Content types are protocol strings: compare ordinally, not by culture.
              if (!part.ContentType.StartsWith(mainPartContentType, StringComparison.OrdinalIgnoreCase))
              {
                  continue;
              }

              using (Stream stream = part.GetStream())
              using (StreamReader reader = new StreamReader(stream))
              {
                  return reader.ReadToEnd();
              }
          }
      }

      return string.Empty;
  }
  /// <summary>
  /// Splits WordprocessingML into paragraphs and returns the text of each one.
  /// </summary>
  /// <param name="xmlText">XML text of the main document part.</param>
  /// <returns>One entry per matched <c>w:p</c> element, holding the text of its <c>w:t</c> runs.</returns>
  private List<string> ExtractWPTextFromXml(string xmlText)
  {
      List<string> lines = new List<string>();

      // Match <w:p> elements (with or without attributes). Singleline lets '.'
      // cross line breaks, so paragraphs in multi-line XML are not skipped.
      MatchCollection matches = Regex.Matches(
          xmlText,
          "(<w:p\\s.*?>|<w:p>)(.*?)</w:p>",
          RegexOptions.Singleline);

      foreach (Match match in matches)
      {
          // Group 2 is the paragraph's inner XML.
          lines.Add(ExtractWtTextFromXml(match.Groups[2].Value));
      }
      return lines;
  }
  /// <summary>
  /// Concatenates the content of every <c>w:t</c> element found in the given XML fragment.
  /// </summary>
  /// <param name="xmlText">XML fragment, e.g. the inner XML of one paragraph.</param>
  /// <returns>The joined run text; empty when the fragment has no <c>w:t</c> element.</returns>
  private string ExtractWtTextFromXml(string xmlText)
  {
      // Match <w:t> elements (with or without attributes). Singleline lets '.'
      // cross line breaks, so a run whose tag or text contains a newline is kept.
      MatchCollection matches = Regex.Matches(
          xmlText,
          "(<w:t\\s.*?>|<w:t>)(.*?)</w:t>",
          RegexOptions.Singleline);

      StringBuilder sb = new StringBuilder();
      foreach (Match match in matches)
      {
          // The captured text is appended as found; XML entities are not decoded.
          sb.Append(match.Groups[2].Value);
      }
      return sb.ToString();
  }
  240. }
  241. }