Browse Source

添加文本语义相似度计算(词语的余弦相似度)

luocaiyang 10 tháng trước
mục cha
commit
e7a530b02e

+ 19 - 5
wispro.sp.utility/CompareDocx.cs

@@ -237,6 +237,8 @@ namespace wispro.sp.utility
                 }
             }
 
+            public double TextSimilarity { get; set; }
+
         }
         /// <summary>
         /// 原文档路径
@@ -301,18 +303,17 @@ namespace wispro.sp.utility
             result.newWordCount = newtext.Length;
             
             var diff = differ.CreateCharacterDiffs(oldtext, newtext, true);
-            result.EditCount = diff.DiffBlocks.Count;
+            //result.EditCount = diff.DiffBlocks.Count;
 
-            int iDeff = 0;
             int lastPos = 0;
 
             string _CompareResultString = "";
             string lastResult = "";
+            List<string> ModifyList = new List<string>();
             foreach (var change in diff.DiffBlocks)
             {
-                iDeff += change.DeleteCountA + change.InsertCountB;
-                result.DeleteCount += change.DeleteCountA;
-                result.InsertCount += change.InsertCountB;
+                string strModifyStr = "";
+                
 
                 lastResult += oldtext.Substring(lastPos, change.DeleteStartA - lastPos);
                 _CompareResultString += oldtext.Substring(lastPos, change.DeleteStartA - lastPos);
@@ -321,14 +322,25 @@ namespace wispro.sp.utility
 
                 if (change.DeleteCountA > 0)
                 {
+                    strModifyStr += $"<strike  style=\"text-decoration: line-through; color: red;\">{oldtext.Substring(change.DeleteStartA, change.DeleteCountA)}</strike>";
                     _CompareResultString += $"<strike  style=\"text-decoration: line-through; color: red;\">{oldtext.Substring(change.DeleteStartA, change.DeleteCountA)}</strike>";
                 }
 
                 if (change.InsertCountB > 0)
                 {
+                    strModifyStr += $"<u style=\"text-decoration: underline; color: blue;\">{newtext.Substring(change.InsertStartB, change.InsertCountB)}</u>";
                     _CompareResultString += $"<u style=\"text-decoration: underline; color: blue;\">{newtext.Substring(change.InsertStartB, change.InsertCountB)}</u>";
                     lastResult += newtext.Substring(change.InsertStartB, change.InsertCountB);
                 }
+
+                if(!ModifyList.Contains(strModifyStr))
+                {
+                    ModifyList.Add(strModifyStr);
+                    result.DeleteCount += change.DeleteCountA;
+                    result.InsertCount += change.InsertCountB;
+                    result.EditCount += 1;
+                }
+                
             }
 
             lastResult += oldtext.Substring(lastPos);
@@ -337,6 +349,8 @@ namespace wispro.sp.utility
 
             result.CompareResultString = _CompareResultString;
 
+            result.TextSimilarity = CosineSimilarity.Calculate(oldtext, newtext);
+
             return result;
 
         }

+ 51 - 0
wispro.sp.utility/CosineSimilarity.cs

@@ -0,0 +1,51 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace wispro.sp.utility
+{
+    /// <summary>Cosine similarity of two texts over their Jieba word-frequency vectors.</summary>
+    public class CosineSimilarity
+    {
+        /// <summary>Computes the cosine similarity of the word counts of two texts.</summary>
+        /// <param name="s1">First text; null or empty yields no terms.</param>
+        /// <param name="s2">Second text; null or empty yields no terms.</param>
+        /// <returns>
+        /// The cosine of the angle between the count vectors; when a text yields no terms,
+        /// 1 if both are empty and 0 otherwise (the unguarded division returned NaN).
+        /// </returns>
+        public static double Calculate(string s1, string s2)
+        {
+            var vector1 = GetTermFrequencyVector(s1);
+            var vector2 = GetTermFrequencyVector(s2);
+            if (vector1.Count == 0 || vector2.Count == 0)
+                return vector1.Count == vector2.Count ? 1.0 : 0.0;
+            // Only terms present in both vectors contribute to the dot product.
+            double dotProduct = 0;
+            foreach (var pair in vector1)
+            {
+                if (vector2.TryGetValue(pair.Key, out double other))
+                    dotProduct += pair.Value * other;
+            }
+            double norm1 = vector1.Values.Sum(v => v * v);
+            double norm2 = vector2.Values.Sum(v => v * v);
+            return dotProduct / (Math.Sqrt(norm1) * Math.Sqrt(norm2));
+        }
+
+        /// <summary>Counts each Jieba-segmented term of the text after removing line breaks.</summary>
+        private static Dictionary<string, double> GetTermFrequencyVector(string text)
+        {
+            var vector = new Dictionary<string, double>();
+            if (string.IsNullOrEmpty(text)) return vector;
+            var cleaned = text.Replace("\r\n", "").Replace("\r", "").Replace("\n", "");
+            foreach (var term in new Jieba_Segmenter().Cut(cleaned))
+            {
+                vector.TryGetValue(term, out double count);
+                vector[term] = count + 1;
+            }
+            return vector;
+        }
+    }
+}

+ 79 - 5
wispro.sp.utility/IPEasyUtility.cs

@@ -228,14 +228,16 @@ namespace wispro.sp.utility
                     inputSearch.SendKeys(caseNo.Trim());
 
                     var btnSearch = waitGetElementById(wait, "btn_Search");// driver.FindElement(By.Id("btn_Search"));
-                    btnSearch.Click();
-
+                    driver.ExecuteJavaScript("arguments[0].click();",btnSearch);
+                    //btnSearch.Click();
+                    
                     try
                     {
-                        var caseLink = wait.Until((d) => { 
+                        var caseLink = wait.Until((d) =>
+                        {
                             return d.FindElement(By.XPath($"//a[contains(text(),'{caseNo}')]"));
-                        }); 
-                        caseLink.Click();
+                        });
+                        driver.ExecuteJavaScript("arguments[0].click();", caseLink);
                     }
                     catch(Exception ex)
                     {
@@ -353,6 +355,78 @@ namespace wispro.sp.utility
 
             return retObject;
         }
+        /// <summary>
+        /// Gets the list of new-application cases that are currently being submitted
+        /// or that were completed on the previous day.
+        /// </summary>
+        /// <remarks>
+        /// Work in progress: the method logs in, navigates to the case search and runs
+        /// a search, but the search-criteria and export regions are still empty, so
+        /// <paramref name="type"/> is not used yet and the returned table has no rows.
+        /// </remarks>
+        /// <param name="type">0: being submitted; 1: completed on the previous day.</param>
+        /// <returns>A <see cref="DataTable"/> intended to hold the matching cases.</returns>
+        /// <exception cref="Exception">
+        /// Any failure while driving the browser. The original exception is propagated
+        /// unchanged, so its concrete type and stack trace are preserved.
+        /// </exception>
+        public static DataTable GetFinished3FilesCases(int type)
+        {
+            DataTable retObject = new DataTable();
+
+            using (IWebDriver driver = CreateChromeDriver())
+            {
+                try
+                {
+                    // The shared wait times out after five seconds.
+                    WebDriverWait wait = new WebDriverWait(driver, TimeSpan.FromSeconds(5));
+
+                    Login(driver, wait);
+
+                    // Click the case-management entry in the top menu bar.
+                    IWebElement linkCaseManager = waitGetElementByName(wait, "71A7CC35-F597-40E1-9FEF-BE622A3A3B63");
+                    linkCaseManager.Click();
+
+                    // WebDriverWait ignores NotFoundException while polling, so no catch-all
+                    // is needed; any other failure surfaces at once instead of being
+                    // swallowed until the wait times out.
+                    IWebElement linkCaseSearch = wait.Until((d) =>
+                    {
+                        return d.FindElement(By.LinkText("案件查询"));
+                    });
+
+                    // Click through JavaScript, as the other search steps in this file do.
+                    driver.ExecuteJavaScript("arguments[0].click();", linkCaseSearch);
+
+                    // Presumably the patent search entry (going by the variable name) - confirm.
+                    IWebElement patentSearch = waitGetElementByName(wait, "4df7eee3-426f-4ce5-9204-34ccb0fd27f7");
+                    driver.ExecuteJavaScript("arguments[0].click();", patentSearch);
+
+                    // Later lookups run inside the frame at index 1.
+                    driver.SwitchTo().Frame(1);
+
+                    #region Add search criteria
+                    // TODO: fill in the search form according to type.
+                    #endregion
+
+                    // Run the search.
+                    var btnSearch = waitGetElementById(wait, "btn_Search");
+                    driver.ExecuteJavaScript("arguments[0].click();", btnSearch);
+
+                    #region Export search results
+                    // TODO: export the result list and load it into retObject.
+                    #endregion
+                }
+                finally
+                {
+                    // Always shut the browser down, even when a step above throws.
+                    driver.Quit();
+                    killChromProcess();
+                }
+            }
+
+            return retObject;
+        }
 
         private static void Download(dynamic retObject, IWebDriver driver, WebDriverWait wait, IWebElement table_filelist, string fileType)
         {

+ 38 - 0
wispro.sp.utility/JiebaSegmenter.cs

@@ -0,0 +1,38 @@
+using System.Collections.Generic;
+using System.Linq;
+using JiebaNet.Segmenter;
+
+namespace wispro.sp.utility
+{
+    
+    /// <summary>
+    /// Wrapper around jieba.NET's <c>JiebaSegmenter</c> that returns each
+    /// segmentation result as a materialized list. One segmenter instance is
+    /// created per wrapper instance.
+    /// </summary>
+    public class Jieba_Segmenter
+    {
+        private readonly JiebaSegmenter _segmenter = new JiebaSegmenter();
+
+        /// <summary>
+        /// Accurate mode: segments the text with the segmenter's default <c>Cut</c>.
+        /// </summary>
+        /// <param name="text">Text to segment.</param>
+        /// <returns>The segmented words, in the order produced.</returns>
+        public List<string> Cut(string text) => _segmenter.Cut(text).ToList();
+
+        /// <summary>
+        /// Search-engine mode: segments the text with <c>CutForSearch</c>.
+        /// </summary>
+        /// <param name="text">Text to segment.</param>
+        /// <returns>The segmented words, in the order produced.</returns>
+        public List<string> CutForSearch(string text) => _segmenter.CutForSearch(text).ToList();
+
+        /// <summary>
+        /// Full mode: calls <c>Cut</c> with both optional flags set to <c>true</c>.
+        /// </summary>
+        /// <param name="text">Text to segment.</param>
+        /// <returns>The segmented words, in the order produced.</returns>
+        public List<string> CutAll(string text) => _segmenter.Cut(text, true, true).ToList();
+    }
+}

+ 1 - 0
wispro.sp.utility/wispro.sp.utility.csproj

@@ -24,6 +24,7 @@
   <ItemGroup>
     <PackageReference Include="DiffPlex" Version="1.7.2" />
     <PackageReference Include="DocumentFormat.OpenXml" Version="3.1.1" />
+    <PackageReference Include="jieba.NET" Version="0.42.2" />
     <PackageReference Include="MailKit" Version="2.15.0" />
     <PackageReference Include="Microsoft.AspNetCore.Mvc.NewtonsoftJson" Version="5.0.10" />
     <PackageReference Include="Microsoft.Extensions.Configuration" Version="5.0.0" />

+ 13 - 0
wispro.sp.winClient/frmCaseFileCompare.Designer.cs

@@ -34,6 +34,7 @@
             comboBox1 = new System.Windows.Forms.ComboBox();
             richTextBox1 = new System.Windows.Forms.RichTextBox();
             lblMsg = new System.Windows.Forms.Label();
+            button2 = new System.Windows.Forms.Button();
             SuspendLayout();
             // 
             // button1
@@ -92,11 +93,22 @@
             lblMsg.Size = new System.Drawing.Size(0, 28);
             lblMsg.TabIndex = 6;
             // 
+            // button2
+            // 
+            button2.Location = new System.Drawing.Point(777, 30);
+            button2.Name = "button2";
+            button2.Size = new System.Drawing.Size(204, 42);
+            button2.TabIndex = 7;
+            button2.Text = "button2";
+            button2.UseVisualStyleBackColor = true;
+            button2.Click += button2_Click;
+            // 
             // frmCaseFileCompare
             // 
             AutoScaleDimensions = new System.Drawing.SizeF(13F, 28F);
             AutoScaleMode = System.Windows.Forms.AutoScaleMode.Font;
             ClientSize = new System.Drawing.Size(1837, 857);
+            Controls.Add(button2);
             Controls.Add(lblMsg);
             Controls.Add(richTextBox1);
             Controls.Add(comboBox1);
@@ -118,5 +130,6 @@
         private System.Windows.Forms.ComboBox comboBox1;
         private System.Windows.Forms.RichTextBox richTextBox1;
         private System.Windows.Forms.Label lblMsg;
+        private System.Windows.Forms.Button button2;
     }
 }

Những thay đổi đã bị hủy bỏ vì nó quá lớn
+ 296 - 17
wispro.sp.winClient/frmCaseFileCompare.cs


+ 4 - 0
wispro.sp.winClient/wispro.sp.winClient.csproj

@@ -8,6 +8,10 @@
   </PropertyGroup>
 
   <ItemGroup>
+    <PackageReference Include="SharpCompress" Version="0.38.0" />
+  </ItemGroup>
+
+  <ItemGroup>
     <ProjectReference Include="..\wispro.sp.api\wispro.sp.api.csproj" />
     <ProjectReference Include="..\wispro.sp.ipeasyApi\wispro.sp.ipeasyApi.csproj" />
     <ProjectReference Include="..\wispro.sp.share\wispro.sp.share.csproj" />