1 | package nl.tno.metagenomics.imports |
---|

2 | |
---|

3 | class FuzzySearchService { |
---|

4 | |
---|

5 | static transactional = true |
---|

6 | |
---|

7 | // classes for fuzzy string matching |
---|

8 | // <FUZZY MATCHING> |
---|

9 | static def similarity(l_seq, r_seq, degree=2) { |
---|

10 | def l_histo = countNgramFrequency(l_seq, degree) |
---|

11 | def r_histo = countNgramFrequency(r_seq, degree) |
---|

12 | |
---|

13 | dotProduct(l_histo, r_histo) / |
---|

14 | Math.sqrt(dotProduct(l_histo, l_histo) * |
---|

15 | dotProduct(r_histo, r_histo)) |
---|

16 | } |
---|

17 | |
---|

18 | static def countNgramFrequency(sequence, degree) { |
---|

19 | def histo = [:] |
---|

20 | def items = sequence.size() |
---|

21 | |
---|

22 | for (int i = 0; i + degree <= items; i++) |
---|

23 | { |
---|

24 | def gram = sequence[i..<(i + degree)] |
---|

25 | histo[gram] = 1 + histo.get(gram, 0) |
---|

26 | } |
---|

27 | histo |
---|

28 | } |
---|

29 | |
---|

30 | static def dotProduct(l_histo, r_histo) { |
---|

31 | def sum = 0 |
---|

32 | l_histo.each { key, value -> |
---|

33 | sum = sum + l_histo[key] * r_histo.get(key, 0) |
---|

34 | } |
---|

35 | sum |
---|

36 | } |
---|

37 | |
---|

38 | static def stringSimilarity (l_str, r_str, degree=2) { |
---|

39 | |
---|

40 | similarity(l_str.toString().toLowerCase().toCharArray(), |
---|

41 | r_str.toString().toLowerCase().toCharArray(), |
---|

42 | degree) |
---|

43 | } |
---|

44 | |
---|

45 | static def mostSimilar(pattern, candidates, threshold=0) { |
---|

46 | def topScore = 0 |
---|

47 | def bestFit = null |
---|

48 | |
---|

49 | candidates.each { candidate -> |
---|

50 | def score = stringSimilarity(pattern, candidate) |
---|

51 | |
---|

52 | if (score > topScore) { |
---|

53 | topScore = score |
---|

54 | bestFit = candidate |
---|

55 | } |
---|

56 | } |
---|

57 | |
---|

58 | if (topScore < threshold) |
---|

59 | bestFit = null |
---|

60 | |
---|

61 | bestFit |
---|

62 | } |
---|

63 | |
---|

64 | static def mostSimilarWithIndex(pattern, candidates, threshold=0) { |
---|

65 | def topScore = 0 |
---|

66 | def bestFit = null |
---|

67 | |
---|

68 | candidates.eachWithIndex { candidate, idx -> |
---|

69 | def score = stringSimilarity(pattern, candidate) |
---|

70 | |
---|

71 | if (score > topScore) { |
---|

72 | topScore = score |
---|

73 | bestFit = idx |
---|

74 | } |
---|

75 | } |
---|

76 | |
---|

77 | if (topScore < threshold) |
---|

78 | bestFit = null |
---|

79 | |
---|

80 | bestFit |
---|

81 | } |
---|

82 | |
---|

83 | // </FUZZY MATCHING> |
---|

84 | } |
---|