1 | package nl.tno.metagenomics.imports |
---|
2 | |
---|
3 | class FuzzySearchService { |
---|
4 | |
---|
5 | static transactional = true |
---|
6 | |
---|
7 | /** |
---|
8 | * Matches the patterns with the candidates, and returns the best candidates for all patterns, but returning |
---|
9 | * the candidates only once |
---|
10 | * |
---|
11 | * @param patterns List with patterns to search for |
---|
12 | * @param candidates List with candidates to search in |
---|
13 | * @param treshold Treshold the matches have to be above |
---|
14 | * @return |
---|
15 | */ |
---|
16 | static def mostSimilarUnique( patterns, candidates, treshold ) { |
---|
17 | def matches = [] |
---|
18 | |
---|
19 | // Find the best matching candidate for each pattern |
---|
20 | patterns.each { pattern -> |
---|
21 | def topScore = 0 |
---|
22 | def bestFit = null |
---|
23 | |
---|
24 | candidates.each { candidate -> |
---|
25 | def score = stringSimilarity(pattern, candidate); |
---|
26 | if( score >= treshold ) |
---|
27 | matches << [ 'pattern': pattern, 'candidate': candidate, 'score': score ]; |
---|
28 | } |
---|
29 | } |
---|
30 | |
---|
31 | // Sort the list on descending score |
---|
32 | matches = matches.sort( { a, b -> b.score <=> a.score } as Comparator ) |
---|
33 | |
---|
34 | // Loop through the scores and select the best matching for every candidate |
---|
35 | def results = patterns.collect { [ 'pattern': it, 'candidate': null ] } |
---|
36 | def selectedCandidates = []; |
---|
37 | def filledPatterns = []; |
---|
38 | |
---|
39 | matches.each { match -> |
---|
40 | if( !filledPatterns.contains( match.pattern ) && !selectedCandidates.contains( match.candidate ) ) { |
---|
41 | results.find { result -> result.pattern == match.pattern }?.candidate = match.candidate; |
---|
42 | |
---|
43 | selectedCandidates << match.candidate; |
---|
44 | filledPatterns << match.pattern; |
---|
45 | } |
---|
46 | } |
---|
47 | |
---|
48 | return results |
---|
49 | } |
---|
50 | |
---|
51 | // classes for fuzzy string matching |
---|
52 | // <FUZZY MATCHING> |
---|
53 | static def similarity(l_seq, r_seq, degree=2) { |
---|
54 | def l_histo = countNgramFrequency(l_seq, degree) |
---|
55 | def r_histo = countNgramFrequency(r_seq, degree) |
---|
56 | |
---|
57 | dotProduct(l_histo, r_histo) / |
---|
58 | Math.sqrt(dotProduct(l_histo, l_histo) * |
---|
59 | dotProduct(r_histo, r_histo)) |
---|
60 | } |
---|
61 | |
---|
62 | static def countNgramFrequency(sequence, degree) { |
---|
63 | def histo = [:] |
---|
64 | def items = sequence.size() |
---|
65 | |
---|
66 | for (int i = 0; i + degree <= items; i++) |
---|
67 | { |
---|
68 | def gram = sequence[i..<(i + degree)] |
---|
69 | histo[gram] = 1 + histo.get(gram, 0) |
---|
70 | } |
---|
71 | histo |
---|
72 | } |
---|
73 | |
---|
74 | static def dotProduct(l_histo, r_histo) { |
---|
75 | def sum = 0 |
---|
76 | l_histo.each { key, value -> |
---|
77 | sum = sum + l_histo[key] * r_histo.get(key, 0) |
---|
78 | } |
---|
79 | sum |
---|
80 | } |
---|
81 | |
---|
82 | static def stringSimilarity (l_str, r_str, degree=2) { |
---|
83 | |
---|
84 | similarity(l_str.toString().toLowerCase().toCharArray(), |
---|
85 | r_str.toString().toLowerCase().toCharArray(), |
---|
86 | degree) |
---|
87 | } |
---|
88 | |
---|
89 | static def mostSimilar(pattern, candidates, threshold=0) { |
---|
90 | def topScore = 0 |
---|
91 | def bestFit = null |
---|
92 | |
---|
93 | candidates.each { candidate -> |
---|
94 | def score = stringSimilarity(pattern, candidate) |
---|
95 | |
---|
96 | if (score > topScore) { |
---|
97 | topScore = score |
---|
98 | bestFit = candidate |
---|
99 | } |
---|
100 | } |
---|
101 | |
---|
102 | if (topScore < threshold) |
---|
103 | bestFit = null |
---|
104 | |
---|
105 | bestFit |
---|
106 | } |
---|
107 | |
---|
108 | static def mostSimilarWithIndex(pattern, candidates, threshold=0) { |
---|
109 | def topScore = 0 |
---|
110 | def bestFit = null |
---|
111 | |
---|
112 | candidates.eachWithIndex { candidate, idx -> |
---|
113 | def score = stringSimilarity(pattern, candidate) |
---|
114 | |
---|
115 | if (score > topScore) { |
---|
116 | topScore = score |
---|
117 | bestFit = idx |
---|
118 | } |
---|
119 | } |
---|
120 | |
---|
121 | if (topScore < threshold) |
---|
122 | bestFit = null |
---|
123 | |
---|
124 | bestFit |
---|
125 | } |
---|
126 | |
---|
127 | // </FUZZY MATCHING> |
---|
128 | } |
---|