Add imports, change arg vectorize_string

Added packages and methods to `import` into the class for it to work. The default argument for `typ` is `source`, and if the `source_strings` and `compare_strings` arguments are different, not changing it in the call to `vectorize_strings` will throw an error.
This commit is contained in:
avimallu
2022-11-12 10:28:55 -06:00
committed by GitHub
parent e8c6948ae1
commit 47efe7a0fb

View File

@@ -1,3 +1,9 @@
import nmslib
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
class fastfuzzy:
def __init__(self, n_gram, n_jobs, source_strings, compare_strings):
@@ -7,9 +13,9 @@ class fastfuzzy:
self.source = source_strings
self.compare = compare_strings
# Input word variables
# Input word variables are converted to their vector representations
self.source_tfidf_matrix = self.vectorize_strings(self.source)
self.compare_tfidf_matrix = self.vectorize_strings(self.compare)
self.compare_tfidf_matrix = self.vectorize_strings(self.compare, "compare")
self.create_index(self.source_tfidf_matrix)
def create_ngrams(self, strings):
@@ -38,6 +44,7 @@ class fastfuzzy:
return self.vectorizer.transform(strings)
def create_index(self, tf_idf_matrix):
"""Create a special index of distance values based on a cosine similarity driven metric distance."""
index = nmslib.init(
method='simple_invindx', space='negdotprod_sparse_fast',
data_type=nmslib.DataType.SPARSE_VECTOR)
@@ -45,7 +52,8 @@ class fastfuzzy:
index.createIndex(print_progress=True)
self.index = index
def query(self, k, return_df =True):
def query(self, k, return_df=True, replace_indices=True):
"""Returns the k nearest neighbours of each word of compared_string"""
query_matrix = self.index.knnQueryBatch(self.compare_tfidf_matrix, k=k, num_threads=self.n_jobs)
compared_string, matched_strings, matched_conf = ([], [], [])
@@ -55,15 +63,19 @@ class fastfuzzy:
matched_strings.append(query_matrix[i][0])
matched_conf.append(query_matrix[i][1])
# Create a stacked NumPy array representation instead of a list
compared_string = np.hstack(compared_string)
matched_strings = np.hstack(matched_strings)
confidence = np.hstack(matched_conf)
if return_df:
return (pd.DataFrame({
"compared_string" : [self.compare[x] for x in compared_string],
"matched_string" : [self.source[x] for x in matched_strings],
"confidence" : confidence})
.loc[lambda x: x.matched_string != x.compared_string])
if replace_indices:
compared_string = [self.compare[x] for x in compared_string]
matched_strings = [self.source[x] for x in matched_strings]
df = (pd.DataFrame({
"compared_string" : compared_string,
"matched_string" : matched_strings,
"confidence" : confidence}))
return df.loc[lambda x: x.compared_string != x.matched_string]
else:
return sparse.coo_matrix((confidence, (compared_string, matched_strings))).tocsr()