@@ -33,89 +33,35 @@ inline ui32 LevenshteinDistance(TString word1, TString word2) {
3333 return dist[size1][size2];
3434}
3535
36- template <typename Type>
3736class FuzzySearcher {
38- struct WordHit {
39- bool Contains;
40- ui32 LengthDifference;
41- ui32 LevenshteinDistance;
42- Type Data;
43-
44- WordHit (bool contains, ui32 lengthDifference, ui32 levenshteinDistance, Type data)
45- : Contains(contains)
46- , LengthDifference(lengthDifference)
47- , LevenshteinDistance(levenshteinDistance)
48- , Data(data)
49- {}
50-
51- bool operator <(const WordHit& other) const {
52- if (this ->Contains && !other.Contains ) {
53- return true ;
54- }
55- if (this ->Contains && other.Contains ) {
56- return this ->LengthDifference < other.LengthDifference ;
57- }
58- return this ->LevenshteinDistance < other.LevenshteinDistance ;
59- }
60-
61- bool operator >(const WordHit& other) const {
62- if (!this ->Contains && other.Contains ) {
63- return true ;
64- }
65- if (this ->Contains && other.Contains ) {
66- return this ->LengthDifference > other.LengthDifference ;
67- }
68- return this ->LevenshteinDistance > other.LevenshteinDistance ;
69- }
70- };
71-
72- static WordHit CalculateWordHit (TString searchWord, TString testWord, Type testData) {
73- searchWord = to_lower (searchWord);
74- testWord = to_lower (testWord);
75- if (testWord.Contains (searchWord)) {
76- return {1 , static_cast <ui32>(testWord.length () - searchWord.length ()), 0 , testData};
37+ static size_t CalculateWordHit (const TString& searchWord, const TString& testWord) {
38+ size_t findPos = testWord.find (searchWord);
39+ if (findPos != TString::npos) {
40+ return testWord.size () - searchWord.size () + findPos;
7741 } else {
78- ui32 levenshteinDistance = LevenshteinDistance (searchWord, testWord);
79- return {0 , 0 , levenshteinDistance, testData};
42+ return 1000 * LevenshteinDistance (searchWord, testWord);
8043 }
8144 }
8245
8346public:
84- THashMap<TString, Type> Dictionary;
85-
86- FuzzySearcher (const THashMap<TString, Type>& dictionary)
87- : Dictionary(dictionary) {}
88-
89- FuzzySearcher (const TVector<TString>& words) {
90- for (const auto & word : words) {
91- Dictionary[word] = word;
47+ template <typename Type>
48+ static std::vector<const Type*> Search (const std::vector<Type>& dictionary, const TString& searchWord, ui32 limit = 10 ) {
49+ TString search = to_lower (searchWord);
50+ std::vector<std::pair<size_t , size_t >> hits; // {distance, index}
51+ hits.reserve (dictionary.size ());
52+ for (size_t index = 0 ; index < dictionary.size (); ++index) {
53+ hits.emplace_back (CalculateWordHit (search, to_lower (TString (dictionary[index]))), index);
9254 }
93- }
94-
95- TVector<Type> Search (const TString& searchWord, ui32 limit = 10 ) {
96- auto cmp = [](const WordHit& left, const WordHit& right) {
97- return left < right;
98- };
99- std::priority_queue<WordHit, TVector<WordHit>, decltype (cmp)> queue (cmp);
100-
101- for (const auto & [word, data]: Dictionary) {
102- auto wordHit = CalculateWordHit (searchWord, word, data);
103- if (queue.size () < limit) {
104- queue.emplace (wordHit);
105- } else if (queue.size () > 0 && wordHit < queue.top ()) {
106- queue.pop ();
107- queue.emplace (wordHit);
108- }
55+ std::sort (hits.begin (), hits.end ());
56+ if (hits.size () > limit) {
57+ hits.resize (limit);
10958 }
110-
111- TVector<Type> results;
112- while (!queue.empty ()) {
113- results.emplace_back (queue.top ().Data );
114- queue.pop ();
59+ std::vector<const Type*> result;
60+ result.reserve (hits.size ());
61+ for (const auto & hit : hits) {
62+ result.emplace_back (&dictionary[hit.second ]);
11563 }
116-
117- std::reverse (results.begin (), results.end ());
118- return results;
64+ return result;
11965 }
12066};
12167
0 commit comments