1 | package nl.tno.massSequencing.imports |
---|
2 | |
---|
3 | class FuzzySearchService { |
---|
4 | |
---|
5 | static transactional = true |
---|
6 | |
---|
7 | /** |
---|
8 | * Matches the patterns with the candidates, and returns the best candidates for all patterns, but returning |
---|
9 | * the candidates only once |
---|
10 | * |
---|
11 | * @param patterns List with patterns to search for |
---|
12 | * @param candidates List with candidates to search in |
---|
13 | * @param treshold Treshold the matches have to be above |
---|
14 | * @return |
---|
15 | */ |
---|
16 | static def mostSimilarUnique( patterns, candidates, treshold ) { |
---|
17 | println "Patterns: " + patterns.findAll { it }; |
---|
18 | println "Candidates: " + candidates; |
---|
19 | |
---|
20 | def matches = [] |
---|
21 | |
---|
22 | // Find the best matching candidate for each pattern |
---|
23 | patterns.findAll { it }.each { pattern -> |
---|
24 | def topScore = 0 |
---|
25 | def bestFit = null |
---|
26 | |
---|
27 | candidates.each { candidate -> |
---|
28 | def score = stringSimilarity(pattern, candidate); |
---|
29 | if( !score.isNaN() && score >= treshold ) |
---|
30 | matches << [ 'pattern': pattern, 'candidate': candidate, 'score': score ]; |
---|
31 | } |
---|
32 | } |
---|
33 | |
---|
34 | // Sort the list on descending score |
---|
35 | matches = matches.sort( { a, b -> b.score <=> a.score } as Comparator ) |
---|
36 | |
---|
37 | // Loop through the scores and select the best matching for every candidate |
---|
38 | def results = patterns.collect { [ 'pattern': it, 'candidate': null ] } |
---|
39 | def selectedCandidates = []; |
---|
40 | def filledPatterns = []; |
---|
41 | |
---|
42 | matches.each { match -> |
---|
43 | if( !filledPatterns.contains( match.pattern ) && !selectedCandidates.contains( match.candidate ) ) { |
---|
44 | results.find { result -> result.pattern == match.pattern }?.candidate = match.candidate; |
---|
45 | |
---|
46 | selectedCandidates << match.candidate; |
---|
47 | filledPatterns << match.pattern; |
---|
48 | } |
---|
49 | } |
---|
50 | |
---|
51 | return results |
---|
52 | } |
---|
53 | |
---|
54 | // classes for fuzzy string matching |
---|
55 | // <FUZZY MATCHING> |
---|
56 | static def similarity(l_seq, r_seq, degree=2) { |
---|
57 | def l_histo = countNgramFrequency(l_seq, degree) |
---|
58 | def r_histo = countNgramFrequency(r_seq, degree) |
---|
59 | |
---|
60 | dotProduct(l_histo, r_histo) / |
---|
61 | Math.sqrt(dotProduct(l_histo, l_histo) * |
---|
62 | dotProduct(r_histo, r_histo)) |
---|
63 | } |
---|
64 | |
---|
65 | static def countNgramFrequency(sequence, degree) { |
---|
66 | def histo = [:] |
---|
67 | def items = sequence.size() |
---|
68 | |
---|
69 | for (int i = 0; i + degree <= items; i++) |
---|
70 | { |
---|
71 | def gram = sequence[i..<(i + degree)] |
---|
72 | histo[gram] = 1 + histo.get(gram, 0) |
---|
73 | } |
---|
74 | histo |
---|
75 | } |
---|
76 | |
---|
77 | static def dotProduct(l_histo, r_histo) { |
---|
78 | def sum = 0 |
---|
79 | l_histo.each { key, value -> |
---|
80 | sum = sum + l_histo[key] * r_histo.get(key, 0) |
---|
81 | } |
---|
82 | sum |
---|
83 | } |
---|
84 | |
---|
85 | static def stringSimilarity (l_str, r_str, degree=2) { |
---|
86 | |
---|
87 | similarity(l_str.toString().toLowerCase().toCharArray(), |
---|
88 | r_str.toString().toLowerCase().toCharArray(), |
---|
89 | degree) |
---|
90 | } |
---|
91 | |
---|
92 | static def mostSimilar(pattern, candidates, threshold=0) { |
---|
93 | def topScore = 0 |
---|
94 | def bestFit = null |
---|
95 | |
---|
96 | candidates.each { candidate -> |
---|
97 | def score = stringSimilarity(pattern, candidate) |
---|
98 | |
---|
99 | if (score > topScore) { |
---|
100 | topScore = score |
---|
101 | bestFit = candidate |
---|
102 | } |
---|
103 | } |
---|
104 | |
---|
105 | if (topScore < threshold) |
---|
106 | bestFit = null |
---|
107 | |
---|
108 | bestFit |
---|
109 | } |
---|
110 | |
---|
111 | static def mostSimilarWithIndex(pattern, candidates, threshold=0) { |
---|
112 | def topScore = 0 |
---|
113 | def bestFit = null |
---|
114 | |
---|
115 | candidates.eachWithIndex { candidate, idx -> |
---|
116 | def score = stringSimilarity(pattern, candidate) |
---|
117 | |
---|
118 | if (score > topScore) { |
---|
119 | topScore = score |
---|
120 | bestFit = idx |
---|
121 | } |
---|
122 | } |
---|
123 | |
---|
124 | if (topScore < threshold) |
---|
125 | bestFit = null |
---|
126 | |
---|
127 | bestFit |
---|
128 | } |
---|
129 | |
---|
130 | // </FUZZY MATCHING> |
---|
131 | } |
---|