using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using DiffPlex;
using System.IO.Packaging;
using System.Text.RegularExpressions;
using NPOI.XWPF.UserModel;
namespace wispro.sp.utility
{
/// <summary>
/// Compares the text of two .docx documents character by character and reports
/// how much of the original document was deleted and how much was inserted.
/// </summary>
/// <remarks>
/// Text is taken from the <c>w:t</c> elements of the main document part, one line per
/// non-empty <c>w:p</c> paragraph. XML entities are left as they appear in the part, so
/// a character such as <c>&amp;</c> is counted in its escaped form.
/// </remarks>
public class CompareDocx
{
    // Content-type prefix of the main document part inside the OPC package.
    private const string MainPartContentType =
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main";

    // NOTE(review): the markup literals were missing from the reviewed copy of this file
    // (the string literals were empty or unterminated). The values below are a
    // reconstruction - confirm them against whatever renders CompareResultString.
    private const string ParagraphOpen = "<p>";
    private const string ParagraphClose = "</p>";
    private const string LineBreak = "<br>";
    private const string DeletedOpen = "<del>";
    private const string DeletedClose = "</del>";
    private const string InsertedOpen = "<ins>";
    private const string InsertedClose = "</ins>";

    // Matches a non-self-closing <w:p> element; group 1 is its inner XML.
    // "<w:p" must be followed by whitespace or ">" so <w:pPr> and friends are not matched.
    private static readonly Regex ParagraphRegex = new Regex(
        @"<w:p(?:\s[^>]*)?(?<!/)>(.*?)</w:p>",
        RegexOptions.Singleline | RegexOptions.Compiled);

    // Matches a non-self-closing <w:t> element; group 1 is its text content.
    // <w:tab/>, <w:tbl> etc. are not matched for the same reason as above.
    private static readonly Regex TextRunRegex = new Regex(
        @"<w:t(?:\s[^>]*)?(?<!/)>(.*?)</w:t>",
        RegexOptions.Singleline | RegexOptions.Compiled);

    /// <summary>
    /// Path of the original document.
    /// </summary>
    public string oldDocument { get; set; }

    /// <summary>
    /// Path of the revised document.
    /// </summary>
    public string newDocument { get; set; }

    /// <summary>
    /// Overall change ratio: (deleted + inserted characters) / characters in the original document.
    /// Returns 0 when nothing was deleted or inserted (including before any comparison has run).
    /// </summary>
    public double diffRate
    {
        get
        {
            if (_DeleteCount == 0 && _InsertCount == 0)
            {
                // Avoids 0/0 = NaN for a fresh instance or two empty documents.
                return 0.0;
            }

            return ((double)_DeleteCount + (double)_InsertCount) / (double)_oldDocCount;
        }
    }

    private int _oldDocCount;

    /// <summary>
    /// Number of characters extracted from the original document.
    /// </summary>
    public int oldDocumentCount
    {
        get { return _oldDocCount; }
    }

    private int _newDocCount;

    /// <summary>
    /// Number of characters extracted from the revised document.
    /// </summary>
    public int newDocumentCount
    {
        get { return _newDocCount; }
    }

    private int _DeleteCount;

    /// <summary>
    /// Number of characters of the original document that are absent from the revised one.
    /// </summary>
    public int DeleteCount
    {
        get { return _DeleteCount; }
    }

    private int _InsertCount;

    /// <summary>
    /// Number of characters of the revised document that are absent from the original one.
    /// </summary>
    public int InsertCount
    {
        get { return _InsertCount; }
    }

    private double _EditCount;

    /// <summary>
    /// Number of separate edit locations (diff blocks) found by the last comparison.
    /// </summary>
    public double EditCount
    {
        get { return _EditCount; }
    }

    private string _CompareResultString;

    /// <summary>
    /// The original text with every deletion and insertion of the last comparison marked up inline.
    /// </summary>
    public string CompareResultString
    {
        get { return _CompareResultString; }
    }

    /// <summary>
    /// Stores both paths and compares the two documents.
    /// </summary>
    /// <param name="oldFile">Path of the original document.</param>
    /// <param name="newFile">Path of the revised document.</param>
    /// <exception cref="ApplicationException">Either file does not exist.</exception>
    public void Compare(string oldFile, string newFile)
    {
        this.oldDocument = oldFile;
        this.newDocument = newFile;
        Compare();
    }

    /// <summary>
    /// Compares <see cref="oldDocument"/> with <see cref="newDocument"/> and refreshes every
    /// result property of this instance.
    /// </summary>
    /// <exception cref="ApplicationException">Either file does not exist.</exception>
    public void Compare()
    {
        if (!System.IO.File.Exists(this.oldDocument) || !System.IO.File.Exists(this.newDocument))
        {
            throw new ApplicationException("指定的文件不存在!");
        }

        // Both documents go through the same extractor. GetDocTxt trims its result and keeps
        // empty paragraphs, while GetDocxTxt skips empty lines and terminates each line with
        // CRLF, so mixing the two reports differences even for identical files.
        string oldText = GetDocxTxt(this.oldDocument);
        string newText = GetDocxTxt(this.newDocument);

        var differ = new Differ();
        var diff = differ.CreateCharacterDiffs(oldText, newText, true);

        // Totals are accumulated in locals so that a second call does not add to the
        // results of the previous one.
        int deleted = 0;
        int inserted = 0;
        int cursor = 0; // first position of oldText not yet copied to the output
        var marked = new StringBuilder(ParagraphOpen);

        foreach (var change in diff.DiffBlocks)
        {
            deleted += change.DeleteCountA;
            inserted += change.InsertCountB;

            // Unchanged text between the previous block and this one.
            marked.Append(oldText, cursor, change.DeleteStartA - cursor);
            cursor = change.DeleteStartA + change.DeleteCountA;

            if (change.DeleteCountA > 0)
            {
                marked.Append(DeletedOpen)
                      .Append(oldText, change.DeleteStartA, change.DeleteCountA)
                      .Append(DeletedClose);
            }

            if (change.InsertCountB > 0)
            {
                marked.Append(InsertedOpen)
                      .Append(newText, change.InsertStartB, change.InsertCountB)
                      .Append(InsertedClose);
            }
        }

        // Unchanged tail after the last block.
        marked.Append(oldText, cursor, oldText.Length - cursor);
        marked.Replace("\r\n", LineBreak + "\r\n").Append(ParagraphClose);

        _oldDocCount = oldText.Length;
        _newDocCount = newText.Length;
        _DeleteCount = deleted;
        _InsertCount = inserted;
        _EditCount = diff.DiffBlocks.Count;
        _CompareResultString = marked.ToString();
    }

    /// <summary>
    /// Reads a document through NPOI: every entry of <c>XWPFDocument.Paragraphs</c> is prefixed
    /// with CRLF and the concatenation is trimmed. Not used by <see cref="Compare()"/>.
    /// </summary>
    /// <param name="filepath">Path of the .docx file.</param>
    /// <returns>The trimmed paragraph text.</returns>
    private string GetDocTxt(string filepath)
    {
        using (var stream = File.OpenRead(filepath))
        {
            XWPFDocument doc = new XWPFDocument(stream);
            var text = new StringBuilder();
            foreach (var para in doc.Paragraphs)
            {
                text.Append("\r\n").Append(para.Text);
            }

            return text.ToString().Trim();
        }
    }

    /// <summary>
    /// Extracts the text of a .docx file: one CRLF-terminated line per non-empty paragraph.
    /// </summary>
    /// <param name="filepath">Path of the .docx file.</param>
    /// <returns>The extracted text; empty when the file has no main document part or no text.</returns>
    private string GetDocxTxt(string filepath)
    {
        string mainXml = getDocxMainXml(filepath);
        List<string> paragraphs = ExtractWPTextFromXml(mainXml);
        return List2String(paragraphs);
    }

    /// <summary>
    /// Joins the non-empty entries of <paramref name="lines"/>, appending CRLF to each.
    /// </summary>
    /// <param name="lines">Lines to join; null or empty entries are skipped.</param>
    /// <returns>The joined text.</returns>
    private string List2String(List<string> lines)
    {
        StringBuilder sb = new StringBuilder();
        foreach (var line in lines)
        {
            if (!string.IsNullOrEmpty(line))
            {
                sb.Append(line).Append("\r\n");
            }
        }

        return sb.ToString();
    }

    /// <summary>
    /// Returns the raw XML of the first package part whose content type starts with the
    /// WordprocessingML main-document content type.
    /// </summary>
    /// <param name="filePath">Path of the .docx file.</param>
    /// <returns>The XML text, or an empty string when no such part exists.</returns>
    private string getDocxMainXml(string filePath)
    {
        // Read-only access with shared reading: the two-argument Package.Open overload asks
        // for read/write access, which fails on read-only files.
        using (Package package = Package.Open(filePath, FileMode.Open, FileAccess.Read, FileShare.Read))
        {
            foreach (PackagePart part in package.GetParts())
            {
                if (!part.ContentType.StartsWith(MainPartContentType, StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                using (Stream stream = part.GetStream(FileMode.Open, FileAccess.Read))
                using (StreamReader reader = new StreamReader(stream))
                {
                    return reader.ReadToEnd();
                }
            }
        }

        return string.Empty;
    }

    /// <summary>
    /// Splits WordprocessingML into paragraphs and returns the text of each one.
    /// </summary>
    /// <param name="xmlText">XML of the main document part.</param>
    /// <returns>One entry per matched <c>w:p</c> element; an entry is empty when the paragraph has no text.</returns>
    private List<string> ExtractWPTextFromXml(string xmlText)
    {
        List<string> lines = new List<string>();
        foreach (Match match in ParagraphRegex.Matches(xmlText))
        {
            lines.Add(ExtractWtTextFromXml(match.Groups[1].Value));
        }

        return lines;
    }

    /// <summary>
    /// Concatenates the content of every <c>w:t</c> element found in <paramref name="xmlText"/>.
    /// </summary>
    /// <param name="xmlText">Inner XML of a paragraph.</param>
    /// <returns>The concatenated text, with XML entities left as found.</returns>
    private string ExtractWtTextFromXml(string xmlText)
    {
        StringBuilder sb = new StringBuilder();
        foreach (Match match in TextRunRegex.Matches(xmlText))
        {
            sb.Append(match.Groups[1].Value);
        }

        return sb.ToString();
    }
}
}