luocaiyang 11 months ago
parent
commit
2a0ba0512a
2 changed files with 270 additions and 0 deletions
  1. 268 0
      wispro.sp.utility/CompareDocx.cs
  2. 2 0
      wispro.sp.utility/wispro.sp.utility.csproj

+ 268 - 0
wispro.sp.utility/CompareDocx.cs

@@ -0,0 +1,268 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Text;
+using DiffPlex;
+using System.IO.Packaging;
+using System.Text.RegularExpressions;
+using NPOI.XWPF.UserModel;
+
+namespace wispro.sp.utility
+{
+    /// <summary>
+    /// Compares the text of two .docx documents character by character and reports
+    /// how much was deleted and inserted, together with an HTML rendering of the changes.
+    /// </summary>
+    public class CompareDocx
+    {
+        // Content-type prefix that identifies the main document part inside a .docx package.
+        private const string MainDocumentContentType =
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main";
+
+        // Matches one w:p (paragraph) element; group 2 holds its inner XML.
+        // Singleline lets '.' cross line breaks so pretty-printed XML is handled as well.
+        private static readonly Regex ParagraphRegex = new Regex(
+            "(<w:p\\s.*?>|<w:p>)(.*?)</w:p>",
+            RegexOptions.Compiled | RegexOptions.Singleline);
+
+        // Matches one w:t (text run) element; group 2 holds the still XML-escaped text.
+        private static readonly Regex TextRunRegex = new Regex(
+            "(<w:t\\s.*?>|<w:t>)(.*?)</w:t>",
+            RegexOptions.Compiled | RegexOptions.Singleline);
+
+        private int _oldDocCount;
+        private int _newDocCount;
+        private int _DeleteCount;
+        private int _InsertCount;
+        private double _EditCount;
+        private string _CompareResultString;
+
+        /// <summary>
+        /// Path of the original document.
+        /// </summary>
+        public string oldDocument { get; set; }
+
+        /// <summary>
+        /// Path of the revised document.
+        /// </summary>
+        public string newDocument { get; set; }
+
+        /// <summary>
+        /// Overall change ratio: (deleted + inserted characters) divided by the character
+        /// count of the original document. Because this is a floating-point division, the
+        /// value is NaN or infinity while the original character count is zero.
+        /// </summary>
+        public double diffRate
+        {
+            get { return ((double)_DeleteCount + (double)_InsertCount) / (double)_oldDocCount; }
+        }
+
+        /// <summary>
+        /// Number of characters extracted from the original document.
+        /// </summary>
+        public int oldDocumentCount
+        {
+            get { return _oldDocCount; }
+        }
+
+        /// <summary>
+        /// Number of characters extracted from the revised document.
+        /// </summary>
+        public int newDocumentCount
+        {
+            get { return _newDocCount; }
+        }
+
+        /// <summary>
+        /// Number of characters of the original document that are missing from the revision.
+        /// </summary>
+        public int DeleteCount
+        {
+            get { return _DeleteCount; }
+        }
+
+        /// <summary>
+        /// Number of characters of the revision that are not in the original document.
+        /// </summary>
+        public int InsertCount
+        {
+            get { return _InsertCount; }
+        }
+
+        /// <summary>
+        /// Number of separate change blocks reported by the diff.
+        /// </summary>
+        public double EditCount
+        {
+            get { return _EditCount; }
+        }
+
+        /// <summary>
+        /// HTML rendering of the comparison: deletions are struck through in red,
+        /// insertions are underlined in blue, and every text line is wrapped in a p element.
+        /// </summary>
+        public string CompareResultString
+        {
+            get { return _CompareResultString; }
+        }
+
+        /// <summary>
+        /// Stores the two paths and compares the documents.
+        /// </summary>
+        /// <param name="oldFile">Path of the original document.</param>
+        /// <param name="newFile">Path of the revised document.</param>
+        /// <exception cref="ApplicationException">Either file does not exist.</exception>
+        public void Compare(string oldFile, string newFile)
+        {
+            this.oldDocument = oldFile;
+            this.newDocument = newFile;
+
+            Compare();
+        }
+
+        /// <summary>
+        /// Compares <see cref="oldDocument"/> with <see cref="newDocument"/> and refreshes
+        /// all count properties and <see cref="CompareResultString"/>.
+        /// </summary>
+        /// <exception cref="ApplicationException">Either file does not exist.</exception>
+        public void Compare()
+        {
+            if (!File.Exists(this.oldDocument) || !File.Exists(this.newDocument))
+            {
+                throw new ApplicationException("指定的文件不存在!");
+            }
+
+            // NOTE(review): the original is read through NPOI (GetDocTxt) while the revision is
+            // read from the raw package XML (GetDocxTxt). GetDocTxt trims the result, whereas
+            // List2String skips empty lines and ends with a line break, so two identical files
+            // may still report differences. Confirm that this asymmetry is intended.
+            string oldtext = GetDocTxt(this.oldDocument);
+            string newtext = GetDocxTxt(this.newDocument);
+
+            var diff = new Differ().CreateCharacterDiffs(oldtext, newtext, true);
+
+            // Local accumulators start at zero on every call, so running Compare() again on the
+            // same instance does not add on top of the previous totals.
+            int deleted = 0;
+            int inserted = 0;
+            int lastPos = 0;
+            var html = new StringBuilder("<p>");
+
+            foreach (var change in diff.DiffBlocks)
+            {
+                deleted += change.DeleteCountA;
+                inserted += change.InsertCountB;
+
+                // Unchanged text between the previous block and this one.
+                html.Append(HtmlEncode(oldtext.Substring(lastPos, change.DeleteStartA - lastPos)));
+                lastPos = change.DeleteStartA + change.DeleteCountA;
+
+                if (change.DeleteCountA > 0)
+                {
+                    html.Append("<strike  style=\"text-decoration: line-through; color: red;\">")
+                        .Append(HtmlEncode(oldtext.Substring(change.DeleteStartA, change.DeleteCountA)))
+                        .Append("</strike>");
+                }
+
+                if (change.InsertCountB > 0)
+                {
+                    html.Append("<u style=\"text-decoration: underline; color: blue;\">")
+                        .Append(HtmlEncode(newtext.Substring(change.InsertStartB, change.InsertCountB)))
+                        .Append("</u>");
+                }
+            }
+
+            // Unchanged tail after the last block; then turn line breaks into paragraph boundaries.
+            html.Append(HtmlEncode(oldtext.Substring(lastPos)));
+            html.Replace("\r\n", "</p>\r\n<p>").Append("</p>");
+
+            _oldDocCount = oldtext.Length;
+            _newDocCount = newtext.Length;
+            _EditCount = diff.DiffBlocks.Count;
+            _DeleteCount = deleted;
+            _InsertCount = inserted;
+            _CompareResultString = html.ToString();
+        }
+
+        /// <summary>
+        /// Escapes document text so that characters such as &lt; and &amp; cannot break or
+        /// inject markup in the HTML result.
+        /// </summary>
+        private static string HtmlEncode(string text)
+        {
+            return System.Net.WebUtility.HtmlEncode(text);
+        }
+
+        /// <summary>
+        /// Reads the document through NPOI and joins the text of its paragraphs with line breaks.
+        /// </summary>
+        /// <param name="filepath">Path of the .docx file.</param>
+        /// <returns>The paragraph texts separated by "\r\n", trimmed at both ends.</returns>
+        private string GetDocTxt(string filepath)
+        {
+            using (var stream = File.OpenRead(filepath))
+            {
+                XWPFDocument doc = new XWPFDocument(stream);
+                var sb = new StringBuilder();
+
+                foreach (var para in doc.Paragraphs)
+                {
+                    sb.Append("\r\n").Append(para.Text);
+                }
+
+                return sb.ToString().Trim();
+            }
+        }
+
+        /// <summary>
+        /// Reads the main document XML from the package and extracts its text, one line per paragraph.
+        /// </summary>
+        /// <param name="filepath">Path of the .docx file.</param>
+        /// <returns>The non-empty paragraph texts, each followed by "\r\n".</returns>
+        private string GetDocxTxt(string filepath)
+        {
+            string xml = getDocxMainXml(filepath);
+            return List2String(ExtractWPTextFromXml(xml));
+        }
+
+        /// <summary>
+        /// Concatenates the non-empty entries of <paramref name="lines"/>, appending "\r\n" to each.
+        /// </summary>
+        private string List2String(List<string> lines)
+        {
+            var sb = new StringBuilder();
+            foreach (var line in lines)
+            {
+                if (!string.IsNullOrEmpty(line))
+                {
+                    sb.Append(line).Append("\r\n");
+                }
+            }
+
+            return sb.ToString();
+        }
+
+        /// <summary>
+        /// Returns the XML of the first package part whose content type marks it as the main
+        /// word-processing document, or an empty string when no such part exists.
+        /// </summary>
+        /// <param name="filePath">Path of the .docx file.</param>
+        private string getDocxMainXml(string filePath)
+        {
+            // Open read-only with shared read: the file is only inspected, so read-only
+            // documents must work and other readers must not be locked out.
+            using (Package package = Package.Open(filePath, FileMode.Open, FileAccess.Read, FileShare.Read))
+            {
+                foreach (var part in package.GetParts())
+                {
+                    if (part.ContentType.StartsWith(MainDocumentContentType, StringComparison.Ordinal))
+                    {
+                        using (Stream stream = part.GetStream(FileMode.Open, FileAccess.Read))
+                        using (var reader = new StreamReader(stream))
+                        {
+                            return reader.ReadToEnd();
+                        }
+                    }
+                }
+
+                return string.Empty;
+            }
+        }
+
+        /// <summary>
+        /// Splits the document XML into paragraphs and returns the text of each one.
+        /// </summary>
+        /// <param name="xmlText">XML of the main document part.</param>
+        /// <returns>One entry per matched paragraph; entries may be empty.</returns>
+        private List<string> ExtractWPTextFromXml(string xmlText)
+        {
+            var lines = new List<string>();
+
+            foreach (Match match in ParagraphRegex.Matches(xmlText))
+            {
+                lines.Add(ExtractWtTextFromXml(match.Groups[2].Value));
+            }
+
+            return lines;
+        }
+
+        /// <summary>
+        /// Concatenates the content of every text run found in <paramref name="xmlText"/>.
+        /// </summary>
+        /// <param name="xmlText">Inner XML of a paragraph.</param>
+        /// <returns>The paragraph text with XML character entities decoded.</returns>
+        private string ExtractWtTextFromXml(string xmlText)
+        {
+            var sb = new StringBuilder();
+            foreach (Match match in TextRunRegex.Matches(xmlText))
+            {
+                sb.Append(match.Groups[2].Value);
+            }
+
+            // The regex returns raw XML, so entities such as &amp; or &lt; are still escaped;
+            // decode them so that lengths and diffs are based on the real characters.
+            return System.Net.WebUtility.HtmlDecode(sb.ToString());
+        }
+    }
+}

+ 2 - 0
wispro.sp.utility/wispro.sp.utility.csproj

@@ -10,6 +10,8 @@
   </ItemGroup>
 
   <ItemGroup>
+    <PackageReference Include="DiffPlex" Version="1.7.2" />
+    <PackageReference Include="DocumentFormat.OpenXml" Version="3.1.1" />
     <PackageReference Include="MailKit" Version="2.15.0" />
     <PackageReference Include="Microsoft.AspNetCore.Mvc.NewtonsoftJson" Version="5.0.10" />
     <PackageReference Include="Microsoft.Extensions.Configuration" Version="5.0.0" />