| 509 | // classes for fuzzy string matching |

| 510 | // <FUZZY MATCHING> |

| 511 | static def similarity(l_seq, r_seq, degree=2) { |

| 512 | def l_histo = countNgramFrequency(l_seq, degree) |

| 513 | def r_histo = countNgramFrequency(r_seq, degree) |

| 514 | |

| 515 | dotProduct(l_histo, r_histo) / |

| 516 | Math.sqrt(dotProduct(l_histo, l_histo) * |

| 517 | dotProduct(r_histo, r_histo)) |

| 518 | } |

| 519 | |

| 520 | static def countNgramFrequency(sequence, degree) { |

| 521 | def histo = [:] |

| 522 | def items = sequence.size() |

| 523 | |

| 524 | for (int i = 0; i + degree <= items; i++) |

| 525 | { |

| 526 | def gram = sequence[i..<(i + degree)] |

| 527 | histo[gram] = 1 + histo.get(gram, 0) |

| 528 | } |

| 529 | histo |

| 530 | } |

| 531 | |

| 532 | static def dotProduct(l_histo, r_histo) { |

| 533 | def sum = 0 |

| 534 | l_histo.each { key, value -> |

| 535 | sum = sum + l_histo[key] * r_histo.get(key, 0) |

| 536 | } |

| 537 | sum |

| 538 | } |

| 539 | |

| 540 | static def stringSimilarity (l_str, r_str, degree=2) { |

| 541 | similarity(l_str.toLowerCase().toCharArray(), |

| 542 | r_str.toLowerCase().toCharArray(), |

| 543 | degree) |

| 544 | } |

| 545 | |

| 546 | static def mostSimilar(pattern, candidates, threshold=0) { |

| 547 | def topScore = 0 |

| 548 | def bestFit = null |

| 549 | |

| 550 | candidates.each { candidate -> |

| 551 | def score = stringSimilarity(pattern, candidate) |

| 552 | if (score > topScore) { |

| 553 | topScore = score |

| 554 | bestFit = candidate |

| 555 | } |

| 556 | } |

| 557 | |

| 558 | if (topScore < threshold) |

| 559 | bestFit = null |

| 560 | |

| 561 | bestFit |

| 562 | } |

| 563 | // </FUZZY MATCHING> |

| 564 | |