!pip install faiss
!pip install faiss-gpu
import numpy as np
import faiss
import torch
from google.colab import drive
import pandas as pd
import re
# Mount Google Drive so the saved artifacts under "My Drive" are readable.
drive.mount('/content/drive')
# Load the saved embeddings. FAISS operates on C-contiguous float32 matrices,
# so normalise dtype/layout once here; no copy is made when the file already
# holds contiguous float32 data.
embeddings = np.ascontiguousarray(
    np.load('/content/drive/My Drive/Colab Notebooks/embeddings/semantic_search.npy'),
    dtype=np.float32,
)
# Load the DataFrame used to look up the source row behind each search hit.
# NOTE(review): presumably row i of the CSV corresponds to embeddings[i] - confirm.
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/embeddings/test_export.csv')
# Exact k-NN search under Euclidean distance.
# Build a CPU index first; "Flat" means brute-force comparison against every
# stored vector. The index dimensionality is taken from the embedding width.
cpu_index = faiss.IndexFlatL2(embeddings.shape[1])
# Number of GPUs to spread the index over (Colab provides a single GPU).
n_gpu = 1
# Copy the index to the GPU(s); n_gpu is passed through so that changing the
# setting above actually takes effect.
gpu_index = faiss.index_cpu_to_all_gpus(cpu_index, ngpu=n_gpu)
# Insert our embeddings; each vector's id is its row position in `embeddings`.
gpu_index.add(embeddings)
# Find the 5 most similar documents to document 234.
query_id = 234
top_k = 5
# search() expects a 2-D batch of queries, so the single vector is reshaped to
# one row; -1 infers the width instead of hard-coding the embedding size.
# Document 234 is itself in the index, so expect it among the hits.
distances, indices = gpu_index.search(embeddings[query_id].reshape(1, -1), k=top_k)
# Row 0 of each result matrix holds the hits for our single query.
for result_i, distance in zip(indices[0], distances[0]):
    text = df.iloc[result_i].body
    print('Comment #:', result_i)
    print('L2 Distance', distance)
    # NOTE(review): a placeholder is printed instead of `text` - presumably
    # deliberate redaction; confirm before printing the real body.
    print('"' + 'intentionally blank' + '"')
    print('')