123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869 |
- using System;
- using System.Collections.Generic;
- using System.Linq;
- using System.Text;
- using System.Threading.Tasks;
- namespace wispro.sp.utility
- {
/// <summary>
/// Computes the cosine similarity of two strings from their term-frequency vectors.
/// </summary>
public class CosineSimilarity
{
    /// <summary>
    /// Calculates the cosine similarity between <paramref name="s1"/> and <paramref name="s2"/>.
    /// </summary>
    /// <param name="s1">First text to compare.</param>
    /// <param name="s2">Second text to compare.</param>
    /// <returns>
    /// The dot product of the two term vectors divided by the product of their lengths.
    /// Because term counts are never negative, the value is (up to rounding) between 0 and 1.
    /// Returns 0 when either text produces no terms, instead of the NaN that 0/0 would give.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="s1"/> or <paramref name="s2"/> is <c>null</c>.
    /// </exception>
    public static double Calculate(string s1, string s2)
    {
        if (s1 == null)
        {
            throw new ArgumentNullException(nameof(s1));
        }
        if (s2 == null)
        {
            throw new ArgumentNullException(nameof(s2));
        }

        // Convert both strings into term-frequency vectors.
        var vector1 = GetTermFrequencyVector(s1);
        var vector2 = GetTermFrequencyVector(s2);

        // Accumulate the dot product and both squared lengths over every term
        // that occurs in at least one of the vectors.
        double dotProduct = 0;
        double norm1 = 0;
        double norm2 = 0;
        foreach (var term in vector1.Keys.Union(vector2.Keys))
        {
            // TryGetValue leaves the out value at 0 when the term is absent,
            // which is exactly the weight a missing term should have.
            vector1.TryGetValue(term, out double v1);
            vector2.TryGetValue(term, out double v2);

            dotProduct += v1 * v2;
            norm1 += v1 * v1;
            norm2 += v2 * v2;
        }

        double denominator = Math.Sqrt(norm1) * Math.Sqrt(norm2);
        if (denominator == 0)
        {
            // At least one vector is empty (or all-zero): similarity is undefined,
            // so report "no similarity" rather than propagating NaN to callers.
            return 0;
        }

        return dotProduct / denominator;
    }

    /// <summary>
    /// Returns a copy of <paramref name="vector"/> with every value divided by the
    /// vector's Euclidean length.
    /// </summary>
    /// <param name="vector">Term weights keyed by term.</param>
    /// <returns>
    /// A new dictionary with the same keys and scaled values. An empty input yields an
    /// empty result (no division takes place).
    /// </returns>
    private static Dictionary<string, double> Normalize(Dictionary<string, double> vector)
    {
        double sumOfSquares = 0;
        foreach (var value in vector.Values)
        {
            sumOfSquares += value * value;
        }
        double length = Math.Sqrt(sumOfSquares);

        var result = new Dictionary<string, double>(vector.Count);
        foreach (var pair in vector)
        {
            result.Add(pair.Key, pair.Value / length);
        }

        return result;
    }

    /// <summary>
    /// Builds a length-normalized term-frequency vector for <paramref name="text"/>.
    /// </summary>
    /// <param name="text">Text to split into terms; must not be <c>null</c>.</param>
    /// <returns>
    /// A dictionary mapping each term returned by <c>Jieba_Segmenter.Cut</c> to its
    /// occurrence count, scaled by <see cref="Normalize"/>.
    /// </returns>
    private static Dictionary<string, double> GetTermFrequencyVector(string text)
    {
        // Strip every carriage return and line feed before segmentation.
        // (Removing "\r" and "\n" individually also covers "\r\n" pairs.)
        var singleLine = text.Replace("\r", "").Replace("\n", "");

        // NOTE(review): a new Jieba_Segmenter is created on every call; if its
        // construction is expensive, consider sharing one instance - confirm
        // thread-safety first.
        var terms = new Jieba_Segmenter().Cut(singleLine);

        var counts = new Dictionary<string, double>();
        foreach (var term in terms)
        {
            // A missing term reads as 0, so this both inserts and increments.
            counts.TryGetValue(term, out double count);
            counts[term] = count + 1;
        }

        return Normalize(counts);
    }
}
- }
|