// CosineSimilarity.cs (2.1 KB)

  1. using System;
  2. using System.Collections.Generic;
  3. using System.Linq;
  4. using System.Text;
  5. using System.Threading.Tasks;
  namespace wispro.sp.utility
  {
      /// <summary>
      /// Computes the cosine similarity between two texts, using the terms returned by
      /// <c>Jieba_Segmenter.Cut</c> as vector dimensions and their occurrence counts as weights.
      /// </summary>
      public class CosineSimilarity
      {
          /// <summary>
          /// Calculates the cosine similarity of the term-frequency vectors of two texts.
          /// </summary>
          /// <param name="s1">First text; <c>null</c> is treated as an empty text.</param>
          /// <param name="s2">Second text; <c>null</c> is treated as an empty text.</param>
          /// <returns>
          /// The cosine of the angle between the two term-frequency vectors, capped at 1.
          /// Returns 0 when either text produces no terms, because the cosine is undefined
          /// for a zero vector (the unguarded division would yield <see cref="double.NaN"/>).
          /// </returns>
          public static double Calculate(string s1, string s2)
          {
              // Convert both strings into term-frequency vectors.
              var vector1 = GetTermFrequencyVector(s1);
              var vector2 = GetTermFrequencyVector(s2);

              // A text without terms has no direction to compare against.
              if (vector1.Count == 0 || vector2.Count == 0)
              {
                  return 0;
              }

              // Compute the cosine similarity over the union of all terms.
              double dotProduct = 0;
              double norm1 = 0;
              double norm2 = 0;
              foreach (var term in vector1.Keys.Union(vector2.Keys))
              {
                  // TryGetValue leaves the out value at 0 for a missing term,
                  // which is exactly the weight of an absent dimension.
                  vector1.TryGetValue(term, out double v1);
                  vector2.TryGetValue(term, out double v2);

                  dotProduct += v1 * v2;
                  norm1 += v1 * v1;
                  norm2 += v2 * v2;
              }

              double denominator = Math.Sqrt(norm1) * Math.Sqrt(norm2);

              // Written as a negated comparison so that NaN is rejected as well as 0.
              if (!(denominator > 0))
              {
                  return 0;
              }

              // Rounding in the square roots can push the quotient marginally above 1
              // for identical texts; cap it so callers always see a valid cosine.
              return Math.Min(1.0, dotProduct / denominator);
          }

          /// <summary>
          /// Returns a copy of <paramref name="vector"/> scaled to unit Euclidean length.
          /// </summary>
          /// <param name="vector">Term weights keyed by term.</param>
          /// <returns>
          /// A new dictionary with the same keys; every value is divided by the vector length.
          /// If the length is 0 all values are 0 instead of <see cref="double.NaN"/>.
          /// </returns>
          private static Dictionary<string, double> Normalize(Dictionary<string, double> vector)
          {
              double sumOfSquares = 0;
              foreach (double value in vector.Values)
              {
                  sumOfSquares += value * value;
              }

              double length = Math.Sqrt(sumOfSquares);

              var result = new Dictionary<string, double>(vector.Count);
              foreach (KeyValuePair<string, double> entry in vector)
              {
                  // Guard the division: a zero-length vector cannot be scaled.
                  result.Add(entry.Key, length > 0 ? entry.Value / length : 0);
              }

              return result;
          }

          /// <summary>
          /// Builds the normalized term-frequency vector of a text.
          /// </summary>
          /// <param name="text">Text to segment; <c>null</c> is treated as an empty text.</param>
          /// <returns>
          /// A dictionary mapping each term returned by <c>Jieba_Segmenter.Cut</c> to its
          /// occurrence count, scaled to unit length. Empty when the text contains nothing
          /// besides line breaks.
          /// </returns>
          private static Dictionary<string, double> GetTermFrequencyVector(string text)
          {
              var vector = new Dictionary<string, double>();

              // Line breaks are stripped before segmentation so they never become terms.
              string cleaned = (text ?? string.Empty).Replace("\r", "").Replace("\n", "");
              if (cleaned.Length == 0)
              {
                  // Nothing to segment; skip creating a segmenter.
                  return vector;
              }

              // NOTE(review): a new Jieba_Segmenter is created per call, as in the original;
              // whether an instance can be shared safely is not visible from this file.
              var terms = new Jieba_Segmenter().Cut(cleaned);
              foreach (var term in terms)
              {
                  // Single lookup: count stays 0 when the term has not been seen yet.
                  vector.TryGetValue(term, out double count);
                  vector[term] = count + 1;
              }

              return Normalize(vector);
          }
      }
  }