[29] | 1 | package nl.tno.massSequencing.imports |
---|
[2] | 2 | |
---|
| 3 | class FuzzySearchService { |
---|
| 4 | |
---|
| 5 | static transactional = true |
---|
[21] | 6 | |
---|
| 7 | /** |
---|
| 8 | * Matches the patterns with the candidates, and returns the best candidates for all patterns, but returning |
---|
| 9 | * the candidates only once |
---|
| 10 | * |
---|
| 11 | * @param patterns List with patterns to search for |
---|
| 12 | * @param candidates List with candidates to search in |
---|
| 13 | * @param treshold Treshold the matches have to be above |
---|
| 14 | * @return |
---|
| 15 | */ |
---|
| 16 | static def mostSimilarUnique( patterns, candidates, treshold ) { |
---|
[52] | 17 | println "Patterns: " + patterns.findAll { it }; |
---|
| 18 | println "Candidates: " + candidates; |
---|
| 19 | |
---|
[21] | 20 | def matches = [] |
---|
| 21 | |
---|
| 22 | // Find the best matching candidate for each pattern |
---|
[52] | 23 | patterns.findAll { it }.each { pattern -> |
---|
[21] | 24 | def topScore = 0 |
---|
| 25 | def bestFit = null |
---|
| 26 | |
---|
| 27 | candidates.each { candidate -> |
---|
| 28 | def score = stringSimilarity(pattern, candidate); |
---|
[52] | 29 | if( !score.isNaN() && score >= treshold ) |
---|
[21] | 30 | matches << [ 'pattern': pattern, 'candidate': candidate, 'score': score ]; |
---|
| 31 | } |
---|
| 32 | } |
---|
| 33 | |
---|
| 34 | // Sort the list on descending score |
---|
| 35 | matches = matches.sort( { a, b -> b.score <=> a.score } as Comparator ) |
---|
| 36 | |
---|
| 37 | // Loop through the scores and select the best matching for every candidate |
---|
| 38 | def results = patterns.collect { [ 'pattern': it, 'candidate': null ] } |
---|
| 39 | def selectedCandidates = []; |
---|
| 40 | def filledPatterns = []; |
---|
| 41 | |
---|
| 42 | matches.each { match -> |
---|
| 43 | if( !filledPatterns.contains( match.pattern ) && !selectedCandidates.contains( match.candidate ) ) { |
---|
| 44 | results.find { result -> result.pattern == match.pattern }?.candidate = match.candidate; |
---|
| 45 | |
---|
| 46 | selectedCandidates << match.candidate; |
---|
| 47 | filledPatterns << match.pattern; |
---|
| 48 | } |
---|
| 49 | } |
---|
| 50 | |
---|
| 51 | return results |
---|
| 52 | } |
---|
[2] | 53 | |
---|
| 54 | // classes for fuzzy string matching |
---|
| 55 | // <FUZZY MATCHING> |
---|
| 56 | static def similarity(l_seq, r_seq, degree=2) { |
---|
| 57 | def l_histo = countNgramFrequency(l_seq, degree) |
---|
| 58 | def r_histo = countNgramFrequency(r_seq, degree) |
---|
| 59 | |
---|
| 60 | dotProduct(l_histo, r_histo) / |
---|
| 61 | Math.sqrt(dotProduct(l_histo, l_histo) * |
---|
| 62 | dotProduct(r_histo, r_histo)) |
---|
| 63 | } |
---|
| 64 | |
---|
| 65 | static def countNgramFrequency(sequence, degree) { |
---|
| 66 | def histo = [:] |
---|
| 67 | def items = sequence.size() |
---|
| 68 | |
---|
| 69 | for (int i = 0; i + degree <= items; i++) |
---|
| 70 | { |
---|
| 71 | def gram = sequence[i..<(i + degree)] |
---|
| 72 | histo[gram] = 1 + histo.get(gram, 0) |
---|
| 73 | } |
---|
| 74 | histo |
---|
| 75 | } |
---|
| 76 | |
---|
| 77 | static def dotProduct(l_histo, r_histo) { |
---|
| 78 | def sum = 0 |
---|
| 79 | l_histo.each { key, value -> |
---|
| 80 | sum = sum + l_histo[key] * r_histo.get(key, 0) |
---|
| 81 | } |
---|
| 82 | sum |
---|
| 83 | } |
---|
| 84 | |
---|
| 85 | static def stringSimilarity (l_str, r_str, degree=2) { |
---|
| 86 | |
---|
| 87 | similarity(l_str.toString().toLowerCase().toCharArray(), |
---|
| 88 | r_str.toString().toLowerCase().toCharArray(), |
---|
| 89 | degree) |
---|
| 90 | } |
---|
| 91 | |
---|
| 92 | static def mostSimilar(pattern, candidates, threshold=0) { |
---|
| 93 | def topScore = 0 |
---|
| 94 | def bestFit = null |
---|
| 95 | |
---|
| 96 | candidates.each { candidate -> |
---|
| 97 | def score = stringSimilarity(pattern, candidate) |
---|
| 98 | |
---|
| 99 | if (score > topScore) { |
---|
| 100 | topScore = score |
---|
| 101 | bestFit = candidate |
---|
| 102 | } |
---|
| 103 | } |
---|
| 104 | |
---|
| 105 | if (topScore < threshold) |
---|
| 106 | bestFit = null |
---|
| 107 | |
---|
| 108 | bestFit |
---|
| 109 | } |
---|
| 110 | |
---|
| 111 | static def mostSimilarWithIndex(pattern, candidates, threshold=0) { |
---|
| 112 | def topScore = 0 |
---|
| 113 | def bestFit = null |
---|
| 114 | |
---|
| 115 | candidates.eachWithIndex { candidate, idx -> |
---|
| 116 | def score = stringSimilarity(pattern, candidate) |
---|
| 117 | |
---|
| 118 | if (score > topScore) { |
---|
| 119 | topScore = score |
---|
| 120 | bestFit = idx |
---|
| 121 | } |
---|
| 122 | } |
---|
| 123 | |
---|
| 124 | if (topScore < threshold) |
---|
| 125 | bestFit = null |
---|
| 126 | |
---|
| 127 | bestFit |
---|
| 128 | } |
---|
| 129 | |
---|
| 130 | // </FUZZY MATCHING> |
---|
| 131 | } |
---|