CompareDocx.cs 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313
  1. using System;
  2. using System.Collections.Generic;
  3. using System.IO;
  4. using System.Text;
  5. using DiffPlex;
  6. using System.IO.Packaging;
  7. using System.Text.RegularExpressions;
  8. using System.Linq;
  9. using Microsoft.Office.Interop.Word;
  10. namespace wispro.sp.utility
  11. {
  12. /// <summary>
  13. /// 比较两个Docx文档中文字的不同
  14. /// </summary>
  15. public class CompareDocx
  16. {
  17. /// <summary>
  18. /// 原文档路径
  19. /// </summary>
  20. public string oldDocument { get; set; }
  21. /// <summary>
  22. /// 修订后文档路径
  23. /// </summary>
  24. public string newDocument { get; set; }
  25. /// <summary>
  26. /// 总的修改比率
  27. /// </summary>
  28. public double diffRate
  29. {
  30. get
  31. {
  32. return ((double)_DeleteCount + (double)_InsertCount) / (double)_oldDocCount;
  33. }
  34. }
  35. private int _oldDocCount;
  36. /// <summary>
  37. /// 原文档字数
  38. /// </summary>
  39. public int oldDocumentCount
  40. {
  41. get { return _oldDocCount; }
  42. }
  43. private int _newDocCount;
  44. /// <summary>
  45. /// 修订后文档字数
  46. /// </summary>
  47. public int newDocumentCount
  48. {
  49. get { return _newDocCount; }
  50. }
  51. private int _DeleteCount;
  52. /// <summary>
  53. /// 修订后文档相比原文档删除的字数
  54. /// </summary>
  55. public int DeleteCount
  56. {
  57. get
  58. {
  59. return _DeleteCount;
  60. }
  61. }
  62. private int _InsertCount;
  63. /// <summary>
  64. /// 修改后文档相比原文档插入的字数
  65. /// </summary>
  66. public int InsertCount
  67. {
  68. get
  69. {
  70. return _InsertCount;
  71. }
  72. }
  73. private double _EditCount;
  74. /// <summary>
  75. /// 修订处数量
  76. /// </summary>
  77. public double EditCount
  78. {
  79. get
  80. {
  81. return _EditCount;
  82. }
  83. }
  84. private string _CompareResultString;
  85. /// <summary>
  86. /// 包括修订文字版本的文档
  87. /// </summary>
  88. public string CompareResultString
  89. {
  90. get
  91. {
  92. return _CompareResultString;
  93. }
  94. }
  95. /// <summary>
  96. /// 比较两个文档
  97. /// </summary>
  98. /// <param name="oldFile"></param>
  99. /// <param name="newFile"></param>
  100. public void Compare(string oldFile, string newFile)
  101. {
  102. this.oldDocument = oldFile;
  103. this.newDocument = newFile;
  104. Compare();
  105. }
  106. /// <summary>
  107. /// 比较两个文档
  108. /// </summary>
  109. /// <exception cref="ApplicationException"></exception>
  110. public void Compare()
  111. {
  112. if (!System.IO.File.Exists(this.oldDocument) || !System.IO.File.Exists(this.newDocument))
  113. {
  114. throw new ApplicationException("指定的文件不存在!");
  115. }
  116. var differ = new Differ();
  117. string oldtext = "";
  118. if (this.oldDocument.EndsWith(".doc"))
  119. {
  120. oldtext = GetDocTxt(this.oldDocument);
  121. }
  122. else
  123. {
  124. oldtext = GetDocxTxt(this.oldDocument);
  125. }
  126. _oldDocCount = oldtext.Length;
  127. string newtext = "";
  128. if (this.newDocument.EndsWith(".doc"))
  129. {
  130. newtext = GetDocTxt(this.newDocument);
  131. }
  132. else
  133. {
  134. newtext = GetDocxTxt(this.newDocument);
  135. }
  136. _newDocCount = newtext.Length;
  137. var diff = differ.CreateCharacterDiffs(oldtext, newtext, true);
  138. _EditCount = diff.DiffBlocks.Count;
  139. int iDeff = 0;
  140. int lastPos = 0;
  141. _CompareResultString = "<p>";
  142. string lastResult = "";
  143. foreach (var change in diff.DiffBlocks)
  144. {
  145. iDeff += change.DeleteCountA + change.InsertCountB;
  146. _DeleteCount += change.DeleteCountA;
  147. _InsertCount += change.InsertCountB;
  148. lastResult += oldtext.Substring(lastPos, change.DeleteStartA - lastPos);
  149. _CompareResultString += oldtext.Substring(lastPos, change.DeleteStartA - lastPos);
  150. lastPos = change.DeleteStartA + change.DeleteCountA;
  151. if (change.DeleteCountA > 0)
  152. {
  153. _CompareResultString += $"<strike style=\"text-decoration: line-through; color: red;\">{oldtext.Substring(change.DeleteStartA, change.DeleteCountA)}</strike>";
  154. }
  155. if (change.InsertCountB > 0)
  156. {
  157. _CompareResultString += $"<u style=\"text-decoration: underline; color: blue;\">{newtext.Substring(change.InsertStartB, change.InsertCountB)}</u>";
  158. lastResult += newtext.Substring(change.InsertStartB, change.InsertCountB);
  159. }
  160. }
  161. lastResult += oldtext.Substring(lastPos);
  162. _CompareResultString += oldtext.Substring(lastPos);
  163. _CompareResultString = _CompareResultString.Replace("\r\n", "<br/>");
  164. }
  165. private string GetDocTxt(string filePath)
  166. {
  167. Application word = null;
  168. Document doc = null;
  169. string content = string.Empty;
  170. try
  171. {
  172. // 创建Word应用实例
  173. word = new Application();
  174. // 打开Word文档
  175. System.IO.FileInfo fileInfo = new System.IO.FileInfo(filePath);
  176. doc = word.Documents.Open(fileInfo.FullName);
  177. // 读取文档内容
  178. content = doc.Content.Text;
  179. List<string> lines = content.Split("\r").ToList();
  180. return List2String(lines);
  181. }
  182. catch (Exception ex)
  183. {
  184. throw new Exception($"读取Word文档时发生错误: {ex.Message}");
  185. }
  186. finally
  187. {
  188. // 关闭文档
  189. if (doc != null)
  190. {
  191. doc.Close();
  192. #pragma warning disable CA1416 // 验证平台兼容性
  193. System.Runtime.InteropServices.Marshal.ReleaseComObject(doc);
  194. #pragma warning restore CA1416 // 验证平台兼容性
  195. }
  196. // 退出Word应用
  197. if (word != null)
  198. {
  199. word.Quit();
  200. #pragma warning disable CA1416 // 验证平台兼容性
  201. System.Runtime.InteropServices.Marshal.ReleaseComObject(word);
  202. #pragma warning restore CA1416 // 验证平台兼容性
  203. }
  204. }
  205. }
  206. private string GetDocxTxt(string filepath)
  207. {
  208. var oldtext = getDocxMainXml(filepath);
  209. var oldlines = ExtractWPTextFromXml(oldtext);
  210. oldtext = List2String(oldlines);
  211. return oldtext;
  212. }
  213. private string List2String(List<string> lines)
  214. {
  215. StringBuilder sb = new StringBuilder();
  216. foreach (var line in lines)
  217. {
  218. if (!string.IsNullOrEmpty(line))
  219. {
  220. sb.Append(line.Trim() + "\r\n");
  221. }
  222. }
  223. return sb.ToString();
  224. }
  225. private string getDocxMainXml(string filePath)
  226. {
  227. string text = string.Empty;
  228. using (Package package = Package.Open(filePath, FileMode.Open))
  229. {
  230. var Parts = package.GetParts();
  231. foreach (var part in Parts)
  232. {
  233. if (part.ContentType.StartsWith("application/vnd.openxmlformats-officedocument.wordprocessingml.document.main"))
  234. {
  235. using (Stream stream = part.GetStream())
  236. {
  237. StreamReader reader = new StreamReader(stream);
  238. text = reader.ReadToEnd();
  239. break;
  240. }
  241. }
  242. }
  243. return text;
  244. }
  245. }
  246. private List<string> ExtractWPTextFromXml(string xmlText)
  247. {
  248. List<string> lines = new List<string>();
  249. // 使用正则表达式匹配 <w:t> 标签的内容
  250. MatchCollection matches = Regex.Matches(xmlText, "(<w:p\\s.*?>|<w:p>)(.*?)</w:p>");
  251. foreach (Match match in matches)
  252. {
  253. lines.Add(ExtractWtTextFromXml(match.Groups[2].Value));
  254. }
  255. return lines;
  256. }
  257. private string ExtractWtTextFromXml(string xmlText)
  258. {
  259. // 使用正则表达式匹配 <w:t> 标签的内容
  260. MatchCollection matches = Regex.Matches(xmlText, "(<w:t\\s.*?>|<w:t>)(.*?)</w:t>");
  261. StringBuilder sb = new StringBuilder();
  262. foreach (Match match in matches)
  263. {
  264. sb.Append(match.Groups[2].Value);
  265. }
  266. return sb.ToString();
  267. }
  268. }
  269. }