using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using DiffPlex;
using System.IO.Packaging;
using System.Text.RegularExpressions;
using System.Linq;
using Microsoft.Office.Interop.Word;
using wispro.sp.entity.CompareCase;
namespace wispro.sp.utility
{
/// <summary>
/// Compares the text of two Word documents (.doc or .docx) and reports the differences.
/// </summary>
public class CompareDocx
{
public class PatentDocument
{
/// <summary>Path of the source file, exactly as passed to the constructor.</summary>
public string FilePath { get; set; }
/// <summary>
/// Text collected from the lines that follow a "说明书摘要" heading, joined with CRLF and trimmed.
/// Filled in by List2String; empty when no such lines were found.
/// </summary>
public string Abstract { get; set; }
/// <summary>
/// Text collected from the lines that follow a "权利要求书" heading, joined with CRLF and trimmed.
/// Filled in by List2String; empty when no such lines were found.
/// </summary>
public string Claim { get; set; }
/// <summary>
/// Text collected from the lines that follow a "说明书", "背景技术", "发明内容", "技术领域" or
/// "具体实施方式" heading, joined with CRLF and trimmed. Filled in by List2String.
/// </summary>
public string FullText { get; set; }
/// <summary>
/// Every non-empty line of the document, trimmed and terminated with CRLF.
/// Stays unset when the constructor does not recognise the file extension.
/// </summary>
public string DocumentString { get; set; }
/// <summary>
/// Loads the document at <paramref name="filePath"/> and fills <see cref="DocumentString"/>;
/// the section properties (<see cref="Abstract"/>, <see cref="Claim"/>, <see cref="FullText"/>)
/// are populated as a side effect of reading the text.
/// </summary>
/// <param name="filePath">Path of a .doc or .docx file.</param>
/// <exception cref="ApplicationException">The file does not exist.</exception>
public PatentDocument(string filePath)
{
    this.FilePath = filePath;
    if (!System.IO.File.Exists(this.FilePath))
    {
        throw new ApplicationException("指定的文件不存在!");
    }

    // Compare the extension ordinally and without regard to case, so "A.DOC" and "a.Docx"
    // are routed the same way as their lower-case spellings on every culture.
    string extension = System.IO.Path.GetExtension(this.FilePath);
    if (string.Equals(extension, ".doc", StringComparison.OrdinalIgnoreCase))
    {
        DocumentString = GetDocTxt(this.FilePath);
    }
    else if (string.Equals(extension, ".docx", StringComparison.OrdinalIgnoreCase))
    {
        DocumentString = GetDocxTxt(this.FilePath);
    }
    // Any other extension is deliberately left unread: DocumentString and the section
    // properties keep their default values, exactly as before.
}
/// <summary>
/// Reads the text of a .doc file through Word automation, splits it into lines and
/// hands the lines to List2String.
/// </summary>
/// <param name="filePath">Path of the .doc file to read.</param>
/// <returns>The string returned by List2String for the document's lines.</returns>
/// <exception cref="Exception">
/// Word could not be started, or the document could not be opened or read.
/// The original failure is available as the inner exception.
/// </exception>
private string GetDocTxt(string filePath)
{
    Application word = null;
    Document doc = null;
    try
    {
        // Start a Word instance and open the document by its full path.
        word = new Application();
        System.IO.FileInfo fileInfo = new System.IO.FileInfo(filePath);
        doc = word.Documents.Open(fileInfo.FullName);

        // The document text is split on form feeds and carriage returns.
        string content = doc.Content.Text;
        List<string> lines = content.Split(new string[] { "\f", "\r" }, StringSplitOptions.None).ToList();
        return List2String(lines);
    }
    catch (Exception ex)
    {
        // Keep the original exception as InnerException so its type and stack trace survive.
        throw new Exception($"读取Word文档时发生错误: {ex.Message}", ex);
    }
    finally
    {
        try
        {
            if (doc != null)
            {
                // The document is only read, so never let Word try to save it on close.
                doc.Close(WdSaveOptions.wdDoNotSaveChanges);
#pragma warning disable CA1416 // Validate platform compatibility
                System.Runtime.InteropServices.Marshal.ReleaseComObject(doc);
#pragma warning restore CA1416 // Validate platform compatibility
            }
        }
        finally
        {
            // Runs even when closing the document fails, so the Word process is not left behind.
            if (word != null)
            {
                word.Quit();
#pragma warning disable CA1416 // Validate platform compatibility
                System.Runtime.InteropServices.Marshal.ReleaseComObject(word);
#pragma warning restore CA1416 // Validate platform compatibility
            }
        }
    }
}
/// <summary>
/// Reads a .docx file: loads the main document XML, extracts the text of each paragraph
/// and passes the resulting lines to List2String.
/// </summary>
/// <param name="filepath">Path of the .docx file to read.</param>
/// <returns>The string returned by List2String for the extracted lines.</returns>
private string GetDocxTxt(string filepath)
{
    string mainXml = getDocxMainXml(filepath);
    var paragraphLines = ExtractWPTextFromXml(mainXml);
    return List2String(paragraphLines);
}
/// <summary>
/// Joins the non-empty lines of a document into one CRLF-separated string and, while doing so,
/// sorts the lines into the <see cref="Claim"/>, <see cref="Abstract"/> and <see cref="FullText"/>
/// properties according to the most recent section heading seen.
/// </summary>
/// <param name="lines">Document lines in reading order; null or empty entries are skipped.</param>
/// <returns>Every non-empty line, trimmed, each followed by CRLF.</returns>
private string List2String(List<string> lines)
{
    string[] headings = { "权利要求书", "说明书摘要", "说明书", "背景技术", "发明内容", "技术领域", "具体实施方式", "摘要附图", "说明书附图" };

    StringBuilder allText = new StringBuilder();
    // Builders replace repeated string concatenation; each one is seeded with the current
    // property value so any text already present is kept in front, as before.
    StringBuilder claimText = new StringBuilder(this.Claim);
    StringBuilder abstractText = new StringBuilder(this.Abstract);
    StringBuilder fullText = new StringBuilder(this.FullText);

    string lastBlock = string.Empty;
    foreach (var line in lines)
    {
        if (string.IsNullOrEmpty(line))
        {
            continue;
        }

        allText.Append(line.Trim()).Append("\r\n");

        // A heading is recognised after dropping U+0001 characters, spaces and outer whitespace.
        string headingKey = line.Replace("\u0001", "").Replace(" ", "").Trim();
        if (Array.IndexOf(headings, headingKey) >= 0)
        {
            lastBlock = headingKey;
            continue;
        }

        // Body line: file it under the section opened by the last heading. Lines after
        // "摘要附图" / "说明书附图", or before any heading, belong to no section.
        switch (lastBlock)
        {
            case "权利要求书":
                claimText.Append("\r\n").Append(line);
                break;
            case "说明书摘要":
                abstractText.Append("\r\n").Append(line);
                break;
            case "说明书":
            case "背景技术":
            case "发明内容":
            case "技术领域":
            case "具体实施方式":
                fullText.Append("\r\n").Append(line);
                break;
        }
    }

    this.Abstract = abstractText.ToString().Trim();
    this.Claim = claimText.ToString().Trim();
    this.FullText = fullText.ToString().Trim();
    return allText.ToString();
}
/// <summary>
/// Opens a .docx package and returns the XML of its main document part.
/// </summary>
/// <param name="filePath">Path of the .docx file.</param>
/// <returns>
/// The content of the first part whose content type starts with the WordprocessingML
/// main-document type, or an empty string when the package has no such part.
/// </returns>
private string getDocxMainXml(string filePath)
{
    const string mainPartContentType = "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main";

    // Open read-only and shareable for reading: the two-argument Package.Open overload asks
    // for read/write access, which fails on read-only files and blocks other readers.
    using (Package package = Package.Open(filePath, FileMode.Open, FileAccess.Read, FileShare.Read))
    {
        foreach (var part in package.GetParts())
        {
            if (!part.ContentType.StartsWith(mainPartContentType, StringComparison.Ordinal))
            {
                continue;
            }

            using (Stream stream = part.GetStream(FileMode.Open, FileAccess.Read))
            using (StreamReader reader = new StreamReader(stream))
            {
                return reader.ReadToEnd();
            }
        }
    }

    return string.Empty;
}
// Matches one non-self-closing <w:p ...> ... </w:p> element; "body" captures its inner XML.
// The "\s or >" requirement after "<w:p" keeps <w:pPr>, <w:pict> and similar from matching.
// NOTE(review): the previous pattern was "(|)(.*?)", which can only produce empty matches;
// the paragraph tag is reconstructed from the method name - confirm against the intended pattern.
// Known limit: a paragraph nested inside another one (e.g. text-box content) ends the match
// at the first closing tag.
private static readonly Regex ParagraphPattern = new Regex(
    @"<w:p(?:\s[^>]*)?(?<!/)>(?<body>.*?)</w:p>",
    RegexOptions.Singleline | RegexOptions.Compiled);

/// <summary>
/// Splits WordprocessingML into paragraphs and returns the text of each one.
/// </summary>
/// <param name="xmlText">XML of the main document part.</param>
/// <returns>
/// One entry per matched paragraph, in document order; a paragraph without text yields an empty string.
/// </returns>
private List<string> ExtractWPTextFromXml(string xmlText)
{
    List<string> lines = new List<string>();
    foreach (Match paragraph in ParagraphPattern.Matches(xmlText))
    {
        lines.Add(ExtractWtTextFromXml(paragraph.Groups["body"].Value));
    }
    return lines;
}
// Matches one non-self-closing <w:t ...> ... </w:t> element; "text" captures its content.
// The "\s or >" requirement after "<w:t" keeps <w:tab/>, <w:tbl>, <w:tr>, <w:tc> from matching.
// NOTE(review): the previous pattern was "(|)(.*?)", which can only produce empty matches;
// the text tag is reconstructed from the method name - confirm against the intended pattern.
private static readonly Regex TextElementPattern = new Regex(
    @"<w:t(?:\s[^>]*)?(?<!/)>(?<text>.*?)</w:t>",
    RegexOptions.Singleline | RegexOptions.Compiled);

/// <summary>
/// Concatenates the content of every text element found in a fragment of WordprocessingML.
/// </summary>
/// <param name="xmlText">XML fragment, typically the inner XML of one paragraph.</param>
/// <returns>The combined text with character entities decoded; empty when nothing matches.</returns>
private string ExtractWtTextFromXml(string xmlText)
{
    StringBuilder text = new StringBuilder();
    foreach (Match match in TextElementPattern.Matches(xmlText))
    {
        text.Append(match.Groups["text"].Value);
    }

    // The captured content is still XML-escaped ("&amp;", "&lt;", "&#x4E2D;" ...). Decode it so
    // character counts and diffs operate on the real text, as they do for text read through Word.
    return System.Net.WebUtility.HtmlDecode(text.ToString());
}
}
/// <summary>
/// The original document, loaded by Compare from its first argument.
/// </summary>
public PatentDocument oldDocument { get; set; }
/// <summary>
/// The revised document, loaded by Compare from its second argument.
/// </summary>
public PatentDocument newDocument { get; set; }
/// <summary>
/// Result of comparing the claims text of the two documents.
/// </summary>
public CompareResult ClaimResult { get; set; }
/// <summary>
/// Result of comparing the abstract text of the two documents.
/// </summary>
public CompareResult AbstractResult { get; set; }
/// <summary>
/// Result of comparing the description text of the two documents.
/// </summary>
public CompareResult FulltextResult { get; set; }
/// <summary>
/// Result of comparing the complete text of the two documents.
/// </summary>
public CompareResult AllStringResult { get; set; }
/// <summary>
/// Loads both documents and compares them section by section (claims, abstract, description)
/// as well as over their complete text. The results are stored in the result properties.
/// </summary>
/// <param name="oldFile">Path of the original document.</param>
/// <param name="newFile">Path of the revised document.</param>
public void Compare(string oldFile, string newFile)
{
    // Each document is published on its property as soon as it has loaded, so the original
    // stays available even if loading the revised one throws.
    PatentDocument original = new PatentDocument(oldFile);
    this.oldDocument = original;

    PatentDocument revised = new PatentDocument(newFile);
    this.newDocument = revised;

    this.ClaimResult = StringCompare(original.Claim, revised.Claim);
    this.AbstractResult = StringCompare(original.Abstract, revised.Abstract);
    this.FulltextResult = StringCompare(original.FullText, revised.FullText);
    this.AllStringResult = StringCompare(original.DocumentString, revised.DocumentString);
}
/// <summary>
/// Compares two strings character by character and summarises the differences.
/// </summary>
/// <param name="oldtext">Original text; null is treated as an empty string.</param>
/// <param name="newtext">Revised text; null is treated as an empty string.</param>
/// <returns>
/// A result holding both text lengths, the number of distinct edits with their deleted and
/// inserted character totals, a merged rendering of the two texts, and their cosine similarity.
/// </returns>
public CompareResult StringCompare(string oldtext, string newtext)
{
    CompareResult result = new CompareResult();
    var differ = new Differ();
    if (oldtext == null) { oldtext = ""; }
    if (newtext == null) { newtext = ""; }
    result.oldWordCount = oldtext.Length;
    result.newWordCount = newtext.Length;

    var diff = differ.CreateCharacterDiffs(oldtext, newtext, true);

    int lastPos = 0;
    StringBuilder rendered = new StringBuilder();
    // Modifications already counted; an identical modification occurring again is rendered
    // but not counted a second time.
    HashSet<string> countedModifications = new HashSet<string>();

    foreach (var change in diff.DiffBlocks)
    {
        // Unchanged text between the previous block and this one.
        rendered.Append(oldtext, lastPos, change.DeleteStartA - lastPos);
        lastPos = change.DeleteStartA + change.DeleteCountA;

        string deleted = change.DeleteCountA > 0
            ? oldtext.Substring(change.DeleteStartA, change.DeleteCountA)
            : string.Empty;
        string inserted = change.InsertCountB > 0
            ? newtext.Substring(change.InsertStartB, change.InsertCountB)
            : string.Empty;

        // NOTE(review): deleted and inserted text are emitted back to back with nothing to tell
        // them apart. The original interpolated strings look like they once wrapped each part in
        // markup that has been lost - confirm what wrapping is expected here.
        rendered.Append(deleted).Append(inserted);

        if (countedModifications.Add(deleted + inserted))
        {
            result.DeleteCount += change.DeleteCountA;
            result.InsertCount += change.InsertCountB;
            result.EditCount += 1;
        }
    }

    // Unchanged tail after the last block.
    rendered.Append(oldtext, lastPos, oldtext.Length - lastPos);

    // NOTE(review): the replacement literal was split across two source lines, which does not
    // compile. An HTML line break is assumed to be the intended value - confirm.
    result.CompareResultString = rendered.ToString().Replace("\r\n", "<br/>");
    result.TextSimilarity = CosineSimilarity.Calculate(oldtext, newtext);
    return result;
}
}
}