In [ ]:
!pip install faiss
!pip install faiss-gpu
import numpy as np
import faiss
import torch
from google.colab import drive
import pandas as pd
import re
drive.mount('/content/drive')
In [7]:
# Load the saved embeddings from Drive.
# FAISS works on C-contiguous float32 matrices, so the dtype/layout is
# normalised here once; np.ascontiguousarray makes no copy when the saved
# array already satisfies both. Downstream cells use embeddings.shape[1] as
# the index dimension, so the array is expected to be 2-D.
embeddings = np.ascontiguousarray(
    np.load('/content/drive/My Drive/Colab Notebooks/embeddings/semantic_search.npy'),
    dtype=np.float32,
)
In [8]:
# Reference table used to look up the row behind each search hit.
# NOTE(review): presumably row i of this CSV corresponds to embeddings[i] - confirm.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/My Drive/Colab Notebooks/embeddings/test_export.csv',
)
In [9]:
# k-NN search; Euclidean distance

# Number of GPUs to spread the index over (Colab provides one).
n_gpu = 1

# Build a CPU index first; "flat" = exact brute-force search.
# The index dimension is taken from the embedding width.
cpu_index = faiss.IndexFlatL2(embeddings.shape[1])

# Copy the index to the GPU(s). `ngpu` is driven by `n_gpu` above so the
# setting lives in exactly one place (it was previously hard-coded here
# while `n_gpu` went unused).
gpu_index = faiss.index_cpu_to_all_gpus(cpu_index, ngpu=n_gpu)

# Insert our embeddings. A fresh index is built each time this cell runs,
# so re-running it does not add the vectors twice.
gpu_index.add(embeddings)
In [13]:
# Find the `top_k` most similar documents to document `query_id`.
query_id = 234
top_k = 5

# search() takes a 2-D batch of queries; reshape(1, -1) turns the single row
# into a one-row batch without hard-coding the embedding width (previously a
# literal 768), so this stays consistent with the index dimension taken from
# embeddings.shape[1].
# Returns two (1, top_k) arrays: distances and the matching row indices.
distances, indices = gpu_index.search(embeddings[query_id].reshape(1, -1), k=top_k)
In [18]:
# Report every hit for the single query (row 0 of the search output).
for i, result_i in enumerate(indices[0]):
    # Row looked up for reference; its body is not echoed below - a fixed
    # placeholder is printed in its place.
    text = df.iloc[result_i].body

    print('Comment #:', result_i)
    print('L2 Distance', distances[0, i])
    print('"' + 'intentionally blank' + '"')
    print('')
Comment #: 234
L2 Distance 0.0005493164
"intentionally blank"

Comment #: 1176
L2 Distance 78.74457
"intentionally blank"

Comment #: 240
L2 Distance 98.252625
"intentionally blank"

Comment #: 2934
L2 Distance 104.385284
"intentionally blank"

Comment #: 8608
L2 Distance 104.50043
"intentionally blank"