// CompareDocx.cs
  using System;
  using System.Collections.Generic;
  using System.IO;
  using System.IO.Packaging;
  using System.Net;
  using System.Text;
  using System.Text.RegularExpressions;
  using DiffPlex;
  using NPOI.XWPF.UserModel;
  namespace wispro.sp.utility
  {
      /// <summary>
      /// Compares the text of two .docx documents character by character and reports
      /// how many characters were deleted and inserted, together with an HTML rendering
      /// of the revisions.
      /// </summary>
      /// <remarks>
      /// NOTE(review): the original document is read through NPOI (<see cref="GetDocTxt"/>)
      /// while the revised document is read by scanning the raw main-document XML
      /// (<see cref="GetDocxTxt"/>). The two extractors do not format text identically:
      /// the former emits every paragraph and trims the result, the latter skips empty
      /// paragraphs and ends with a line break. Identical files may therefore still report
      /// differences. Behaviour is kept as-is; confirm whether this asymmetry is intended.
      /// </remarks>
      public class CompareDocx
      {
          // Matches one <w:p> paragraph element; group 2 is its inner XML.
          // Singleline lets '.' cross line breaks so pretty-printed XML is handled too.
          private static readonly Regex ParagraphPattern =
              new Regex("(<w:p\\s.*?>|<w:p>)(.*?)</w:p>", RegexOptions.Singleline);

          // Matches one <w:t> text element; group 2 is its (XML-escaped) text.
          private static readonly Regex TextRunPattern =
              new Regex("(<w:t\\s.*?>|<w:t>)(.*?)</w:t>", RegexOptions.Singleline);

          private int _oldDocCount;
          private int _newDocCount;
          private int _DeleteCount;
          private int _InsertCount;
          private double _EditCount;
          private string _CompareResultString;

          /// <summary>
          /// Path of the original document.
          /// </summary>
          public string oldDocument { get; set; }

          /// <summary>
          /// Path of the revised document.
          /// </summary>
          public string newDocument { get; set; }

          /// <summary>
          /// Overall modification ratio: (deleted + inserted characters) divided by the
          /// character count of the original document. This is a floating-point division,
          /// so an empty original text yields NaN or Infinity instead of throwing.
          /// </summary>
          public double diffRate
          {
              get { return ((double)_DeleteCount + (double)_InsertCount) / (double)_oldDocCount; }
          }

          /// <summary>
          /// Character count of the text extracted from the original document.
          /// </summary>
          public int oldDocumentCount
          {
              get { return _oldDocCount; }
          }

          /// <summary>
          /// Character count of the text extracted from the revised document.
          /// </summary>
          public int newDocumentCount
          {
              get { return _newDocCount; }
          }

          /// <summary>
          /// Number of characters of the original document that were deleted in the revised document.
          /// </summary>
          public int DeleteCount
          {
              get { return _DeleteCount; }
          }

          /// <summary>
          /// Number of characters inserted in the revised document relative to the original.
          /// </summary>
          public int InsertCount
          {
              get { return _InsertCount; }
          }

          /// <summary>
          /// Number of revision spots, i.e. the number of diff blocks reported by the differ.
          /// </summary>
          public double EditCount
          {
              get { return _EditCount; }
          }

          /// <summary>
          /// HTML rendering of the comparison: unchanged text as-is, deletions wrapped in
          /// red <c>&lt;strike&gt;</c>, insertions wrapped in blue <c>&lt;u&gt;</c>, and every
          /// CRLF turned into a paragraph boundary. Document text is HTML-encoded.
          /// </summary>
          public string CompareResultString
          {
              get { return _CompareResultString; }
          }

          /// <summary>
          /// Stores the two paths in <see cref="oldDocument"/> / <see cref="newDocument"/>
          /// and compares the documents.
          /// </summary>
          /// <param name="oldFile">Path of the original document.</param>
          /// <param name="newFile">Path of the revised document.</param>
          /// <exception cref="ApplicationException">Either file does not exist.</exception>
          public void Compare(string oldFile, string newFile)
          {
              this.oldDocument = oldFile;
              this.newDocument = newFile;
              Compare();
          }

          /// <summary>
          /// Compares <see cref="oldDocument"/> with <see cref="newDocument"/> and refreshes
          /// all result properties of this instance.
          /// </summary>
          /// <exception cref="ApplicationException">Either file does not exist.</exception>
          public void Compare()
          {
              if (!File.Exists(this.oldDocument) || !File.Exists(this.newDocument))
              {
                  throw new ApplicationException("指定的文件不存在!");
              }

              // Start from zero so that calling Compare() again on the same instance
              // does not add the new totals on top of the previous run.
              _DeleteCount = 0;
              _InsertCount = 0;

              var oldtext = GetDocTxt(this.oldDocument);
              _oldDocCount = oldtext.Length;
              var newtext = GetDocxTxt(this.newDocument);
              _newDocCount = newtext.Length;

              // Third argument is passed through unchanged from the original code
              // (DiffPlex names it ignoreWhitespace — TODO confirm against the referenced version).
              var diff = new Differ().CreateCharacterDiffs(oldtext, newtext, true);
              _EditCount = diff.DiffBlocks.Count;

              var html = new StringBuilder("<p>");
              int lastPos = 0; // index in oldtext just past the previously handled block
              foreach (var change in diff.DiffBlocks)
              {
                  _DeleteCount += change.DeleteCountA;
                  _InsertCount += change.InsertCountB;

                  // Unchanged text between the previous block and this one.
                  html.Append(WebUtility.HtmlEncode(oldtext.Substring(lastPos, change.DeleteStartA - lastPos)));
                  lastPos = change.DeleteStartA + change.DeleteCountA;

                  if (change.DeleteCountA > 0)
                  {
                      html.Append("<strike style=\"text-decoration: line-through; color: red;\">")
                          .Append(WebUtility.HtmlEncode(oldtext.Substring(change.DeleteStartA, change.DeleteCountA)))
                          .Append("</strike>");
                  }

                  if (change.InsertCountB > 0)
                  {
                      html.Append("<u style=\"text-decoration: underline; color: blue;\">")
                          .Append(WebUtility.HtmlEncode(newtext.Substring(change.InsertStartB, change.InsertCountB)))
                          .Append("</u>");
                  }
              }

              // Unchanged tail after the last block.
              html.Append(WebUtility.HtmlEncode(oldtext.Substring(lastPos)));

              // HtmlEncode leaves CR/LF untouched, so paragraph breaks can still be found here.
              // NOTE(review): a deletion/insertion that spans a line break ends up with its
              // <strike>/<u> element crossing a </p><p> boundary, as in the original code.
              html.Replace("\r\n", "</p>\r\n<p>").Append("</p>");
              _CompareResultString = html.ToString();
          }

          /// <summary>
          /// Reads a document through NPOI and returns its paragraphs, each preceded by CRLF,
          /// with leading/trailing whitespace trimmed from the combined text.
          /// </summary>
          /// <param name="filepath">Path of the .docx file.</param>
          /// <returns>The extracted text.</returns>
          private string GetDocTxt(string filepath)
          {
              using (var stream = File.OpenRead(filepath))
              {
                  XWPFDocument doc = new XWPFDocument(stream);
                  var text = new StringBuilder();
                  foreach (var para in doc.Paragraphs)
                  {
                      text.Append("\r\n").Append(para.Text);
                  }
                  return text.ToString().Trim();
              }
          }

          /// <summary>
          /// Reads a document by scanning its main-document XML and returns the non-empty
          /// paragraphs, each followed by CRLF.
          /// </summary>
          /// <param name="filepath">Path of the .docx file.</param>
          /// <returns>The extracted text.</returns>
          private string GetDocxTxt(string filepath)
          {
              string mainXml = getDocxMainXml(filepath);
              List<string> paragraphs = ExtractWPTextFromXml(mainXml);
              return List2String(paragraphs);
          }

          /// <summary>
          /// Joins the non-empty entries of <paramref name="lines"/>, appending CRLF after each.
          /// </summary>
          /// <param name="lines">Paragraph texts; null or empty entries are skipped.</param>
          /// <returns>The joined text (ends with CRLF when at least one entry was kept).</returns>
          private string List2String(List<string> lines)
          {
              StringBuilder sb = new StringBuilder();
              foreach (var line in lines)
              {
                  if (!string.IsNullOrEmpty(line))
                  {
                      sb.Append(line).Append("\r\n");
                  }
              }
              return sb.ToString();
          }

          /// <summary>
          /// Returns the raw XML of the first package part whose content type starts with the
          /// WordprocessingML main-document type, or an empty string when there is none.
          /// </summary>
          /// <param name="filePath">Path of the .docx package.</param>
          /// <returns>The part's XML text, or <see cref="string.Empty"/>.</returns>
          private string getDocxMainXml(string filePath)
          {
              // Open read-only and shareable: the two-argument overload asks for read-write
              // access, which fails on read-only files and locks the document while it is read.
              using (Package package = Package.Open(filePath, FileMode.Open, FileAccess.Read, FileShare.Read))
              {
                  foreach (var part in package.GetParts())
                  {
                      if (part.ContentType.StartsWith(
                              "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main",
                              StringComparison.Ordinal))
                      {
                          using (Stream stream = part.GetStream(FileMode.Open, FileAccess.Read))
                          using (StreamReader reader = new StreamReader(stream))
                          {
                              return reader.ReadToEnd();
                          }
                      }
                  }
              }
              return string.Empty;
          }

          /// <summary>
          /// Extracts the text of every <c>&lt;w:p&gt;</c> paragraph element found in the XML.
          /// </summary>
          /// <param name="xmlText">Main-document XML.</param>
          /// <returns>One entry per matched paragraph (possibly empty strings).</returns>
          private List<string> ExtractWPTextFromXml(string xmlText)
          {
              List<string> lines = new List<string>();
              // Match each <w:p> element and keep the text of the <w:t> elements inside it.
              foreach (Match match in ParagraphPattern.Matches(xmlText))
              {
                  lines.Add(ExtractWtTextFromXml(match.Groups[2].Value));
              }
              return lines;
          }

          /// <summary>
          /// Concatenates the content of every <c>&lt;w:t&gt;</c> element found in the XML fragment.
          /// </summary>
          /// <param name="xmlText">Inner XML of one paragraph.</param>
          /// <returns>The paragraph text with XML character/entity references decoded.</returns>
          private string ExtractWtTextFromXml(string xmlText)
          {
              StringBuilder sb = new StringBuilder();
              foreach (Match match in TextRunPattern.Matches(xmlText))
              {
                  // The captured value is still XML-escaped (&amp;, &lt;, &#x...;). Decode it so
                  // character counts and the diff work on the real text, like the NPOI-read side.
                  sb.Append(WebUtility.HtmlDecode(match.Groups[2].Value));
              }
              return sb.ToString();
          }
      }
  }