Add imports, change arg vectorize_string
Added packages and methods to `import` into the class for it to work. The default argument for `typ` is `source`, and if the `source_strings` and `compare_strings` arguments are different, not changing it in the call to `vectorize_strings` will throw an error.
This commit is contained in:
@@ -1,3 +1,9 @@
|
||||
import nmslib
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from scipy import sparse
|
||||
|
||||
class fastfuzzy:
|
||||
|
||||
def __init__(self, n_gram, n_jobs, source_strings, compare_strings):
|
||||
@@ -7,9 +13,9 @@ class fastfuzzy:
|
||||
self.source = source_strings
|
||||
self.compare = compare_strings
|
||||
|
||||
# Input word variables
|
||||
# Input word variables are converted to their vector representations
|
||||
self.source_tfidf_matrix = self.vectorize_strings(self.source)
|
||||
self.compare_tfidf_matrix = self.vectorize_strings(self.compare)
|
||||
self.compare_tfidf_matrix = self.vectorize_strings(self.compare, "compare")
|
||||
self.create_index(self.source_tfidf_matrix)
|
||||
|
||||
def create_ngrams(self, strings):
|
||||
@@ -38,6 +44,7 @@ class fastfuzzy:
|
||||
return self.vectorizer.transform(strings)
|
||||
|
||||
def create_index(self, tf_idf_matrix):
|
||||
"""Create a special index of distance values based on a cosine similarity driven metric distance."""
|
||||
index = nmslib.init(
|
||||
method='simple_invindx', space='negdotprod_sparse_fast',
|
||||
data_type=nmslib.DataType.SPARSE_VECTOR)
|
||||
@@ -45,7 +52,8 @@ class fastfuzzy:
|
||||
index.createIndex(print_progress=True)
|
||||
self.index = index
|
||||
|
||||
def query(self, k, return_df =True):
|
||||
def query(self, k, return_df=True, replace_indices=True):
|
||||
"""Returns the k nearest neighbours of each word of compared_string"""
|
||||
query_matrix = self.index.knnQueryBatch(self.compare_tfidf_matrix, k=k, num_threads=self.n_jobs)
|
||||
compared_string, matched_strings, matched_conf = ([], [], [])
|
||||
|
||||
@@ -55,15 +63,19 @@ class fastfuzzy:
|
||||
matched_strings.append(query_matrix[i][0])
|
||||
matched_conf.append(query_matrix[i][1])
|
||||
|
||||
# Create a stacked NumPy array representation instead of a list
|
||||
compared_string = np.hstack(compared_string)
|
||||
matched_strings = np.hstack(matched_strings)
|
||||
confidence = np.hstack(matched_conf)
|
||||
|
||||
if return_df:
|
||||
return (pd.DataFrame({
|
||||
"compared_string" : [self.compare[x] for x in compared_string],
|
||||
"matched_string" : [self.source[x] for x in matched_strings],
|
||||
"confidence" : confidence})
|
||||
.loc[lambda x: x.matched_string != x.compared_string])
|
||||
if replace_indices:
|
||||
compared_string = [self.compare[x] for x in compared_string]
|
||||
matched_strings = [self.source[x] for x in matched_strings]
|
||||
df = (pd.DataFrame({
|
||||
"compared_string" : compared_string,
|
||||
"matched_string" : matched_strings,
|
||||
"confidence" : confidence}))
|
||||
return df.loc[lambda x: x.compared_string != x.matched_string]
|
||||
else:
|
||||
return sparse.coo_matrix((confidence, (compared_string, matched_strings))).tocsr()
|
||||
|
||||
Reference in New Issue
Block a user