Faiss倒排实现
之前提到的暴力搜索虽然使用了并行和指令优化的策略来提升查询性能,但数据量一旦变大,效率还是很难达到要求,于是就有了常规搜索中经常用到的倒排索引
数据结构
struct IndexIVF: Index {
size_t nlist; ///< number of possible key values
size_t nprobe; ///< number of probes at query time
Index * quantizer; ///< quantizer that maps vectors to inverted lists
/**
* = 0: use the quantizer as index in a kmeans training
* = 1: just pass on the training set to the train() of the quantizer
* = 2: kmeans training on a flat index + add the centroids to the quantizer
*/
char quantizer_trains_alone;
bool own_fields; ///< whether object owns the quantizer
ClusteringParameters cp; ///< to override default clustering params
Index *clustering_index; ///< to override index used during clustering
std::vector < std::vector<long> > ids; ///< Inverted lists for indexes
size_t code_size; ///< code size per vector in bytes
std::vector < std::vector<uint8_t> > codes; // binary codes, size nlist
/// map for direct access to the elements. Enables reconstruct().
bool maintain_direct_map;
std::vector <long> direct_map;
/** The Inverted file takes a quantizer (an Index) on input,
* which implements the function mapping a vector to a list
* identifier. The pointer is borrowed: the quantizer should not
* be deleted while the IndexIVF is in use.
*/
IndexIVF (Index * quantizer, size_t d, size_t nlist,
MetricType metric = METRIC_INNER_PRODUCT);
重点关注
nlist: 倒排表的长度,对应聚类的中心点数
nprobe: 每次查询需要查的倒排list的个数
quantizer:量化器,或者说是码表,存储所有的聚类中心点
cp: 聚类配置(见文档-Faiss聚类实现)
ids: 倒排向量ID表,由嵌套的vector实现,一个二维数组
code_size: 向量的字节长度
codes: 倒排向量code表,向量的code就是向量本身
初始化
faiss::IndexFlatL2 quantizer(d); // the other index
faiss::IndexIVFFlat index(&quantizer, d, nlist, faiss::METRIC_L2);
/// Construct a flat IVF index.
///
/// All arguments are forwarded unchanged to the IndexIVF constructor; the
/// only extra work done here is fixing the per-vector code size.
///
/// @param quantizer  index passed through to IndexIVF
/// @param d          vector dimension
/// @param nlist      number of inverted lists
/// @param metric     metric type passed through to IndexIVF
IndexIVFFlat::IndexIVFFlat (Index * quantizer,
size_t d, size_t nlist, MetricType metric):
IndexIVF (quantizer, d, nlist, metric)
{
// one code is d floats, expressed in bytes
code_size = sizeof(float) * d;
}
向量的 code是向量本身 d维float, 所以code size 是sizeof(float) * d
/// Construct the IVF base index around an existing quantizer.
///
/// Initial state set here: nprobe = 1, quantizer_trains_alone = 0,
/// own_fields = false, no clustering_index override, no direct map,
/// and nlist empty entries in both `ids` and `codes`.
///
/// @param quantizer  index stored as-is in the `quantizer` member; its
///                   dimension must equal d (checked below)
/// @param d          vector dimension, forwarded to the Index base
/// @param nlist      number of inverted lists
/// @param metric     metric type, forwarded to the Index base
IndexIVF::IndexIVF (Index * quantizer, size_t d, size_t nlist,
MetricType metric):
Index (d, metric),
nlist (nlist),
nprobe (1),
quantizer (quantizer),
quantizer_trains_alone (0),
own_fields (false),
clustering_index (nullptr),
ids (nlist),
maintain_direct_map (false)
{
// the quantizer must work on vectors of the same dimension
FAISS_THROW_IF_NOT (d == quantizer->d);
// considered trained only when the quantizer is itself trained and
// already holds exactly nlist entries
is_trained = quantizer->is_trained && (quantizer->ntotal == nlist);
// Spherical by default if the metric is inner_product
if (metric_type == METRIC_INNER_PRODUCT) {
cp.spherical = true;
}
// here we set a low # iterations because this is typically used
// for large clusterings (nb this is not used for the MultiIndex,
// for which quantizer_trains_alone = true)
cp.niter = 10;
cp.verbose = verbose;
code_size = 0; // let sub-classes set this
// one (initially empty) code buffer per inverted list
codes.resize(nlist);
}
quantizer:量化器
d:向量维度
nlist:聚类个数
MetricType: 向量距离衡量类型:传入的是METRIC_L2
训练
1) 输入
index.train(nb, xb);
nb: 待索引向量的个数
xb: 待索引向量的数组
2) 训练粗聚类
Clustering clus (d, nlist, cp);
quantizer->reset();
.....
clus.train (n, x, *quantizer);
.....
n: 待索引向量的个数
x: 待索引向量的数组
关于聚类的细节请看文档 Faiss的聚类实现
最后将所有(nlist个)聚类中心(向量)存储在量化器quantizer中
3) 训练量化残差
/// Default residual-training step: performs no training.
///
/// Both parameters are unnamed (commented out) and unused; the only
/// observable effect is a message printed when `verbose` is set.
void IndexIVF::train_residual(idx_t /*n*/, const float* /*x*/) {
if (verbose)
printf("IndexIVF: no residual training\n");
// does nothing by default
}
啥也不干
所以训练就是一次粗聚类
索引
1) 输入
index.add(nb, xb);
nb: 待索引向量的个数
xb: 待索引向量的数组
2) 遍历找到每个向量最近的聚类中心点
/// Run a k-nearest search and keep only the resulting labels.
///
/// @param n       number of input vectors
/// @param x       input vectors, passed straight to search()
/// @param labels  output buffer handed to search() for the result ids
/// @param k       number of neighbours requested per vector
void Index::assign (idx_t n, const float * x, idx_t * labels, idx_t k)
{
// scratch buffer for the n * k distances that search() also produces
float * distances = new float[n * k];
// presumably releases `distances` when this scope ends, since the caller
// only wants `labels` -- confirm against ScopeDeleter's definition
ScopeDeleter<float> del(distances);
search (n, x, k, distances, labels);
}
n: 待索引向量的个数
x: 待索引向量的数组
labels: 存放每个向量距离最近的中心点ID
k: 1 (取 top 1)
其中k=1, 即查询最近的一个
使用 ScopeDeleter<float> del(distances),是因为不需要 distances,只需要 labels:
函数退出时 distances 会被回收掉,只留下最近的聚类中心点 id 数组 labels
search (n, x, k, distances, labels);
的细节见文档 Faiss暴力搜索实现
3) 建立倒排表
3.1) 遍历所有向量
for (size_t i = 0; i < n; i++) {
3.2) 建立倒排id list
long id = xids ? xids[i] : ntotal + i;
long list_no = idx [i];
.....
ids[list_no].push_back (id);
idx是步骤2)返回的那个距离每个向量最近的聚类中心ID 数组
取出向量对应的那个距离最近的聚类中心ID
将向量ID加入到该聚类中心对应的倒排list
3.3) 建立倒排code list
const float *xi = x + i * d;
/* store the vectors */
size_t ofs = codes[list_no].size();
codes[list_no].resize(ofs + code_size);
memcpy(codes[list_no].data() + ofs,xi, code_size);
倒排code存放的就是向量本身(d个float)
搜索
1) 输入
index.search(nq, xq, k, D, I);
nq: 查询向量个数
xq: 查询向量数组
k: 每个查询向量返回个数 top k
D:存放返回的top k个距离
I: 存放返回的top k 个索引向量ID
2) 粗查询(查询聚类中心)
long * idx = new long [n * nprobe];
ScopeDeleter<long> del (idx);
float * coarse_dis = new float [n * nprobe];
ScopeDeleter<float> del2 (coarse_dis);
quantizer->search (n, x, nprobe, coarse_dis, idx);
nprobe: 返回几个最近聚类中心点
quantizer->search (n, x, nprobe, coarse_dis, idx);
实现细节见文档 Faiss暴力搜索实现
搜出查询向量最近的nprobe个聚类中心点ID及对应的距离
3) 查询倒排表前的距离类型判断
/// Search the inverted lists that were already selected for each query.
///
/// Dispatches on `metric_type`: inner product uses a min-heap result array,
/// L2 uses a max-heap result array. In both cases the heap array is built
/// directly over the caller's `labels` and `distances` buffers.
///
/// @param n            number of query vectors
/// @param x            query vectors
/// @param k            number of results per query
/// @param idx          pre-assigned list ids, forwarded to the scan routine
/// @param coarse_dis   unnamed and unused in this implementation
/// @param distances    output distances (storage for the heap values)
/// @param labels       output ids (storage for the heap ids)
/// @param store_pairs  forwarded to the scan routine
void IndexIVFFlat::search_preassigned (idx_t n, const float *x, idx_t k,
const idx_t *idx,
const float * /* coarse_dis */,
float *distances, idx_t *labels,
bool store_pairs) const
{
if (metric_type == METRIC_INNER_PRODUCT) {
// heap array fields in order: n, k, labels, distances
float_minheap_array_t res = {
size_t(n), size_t(k), labels, distances};
search_knn_inner_product (*this, n, x, idx, &res, store_pairs);
} else if (metric_type == METRIC_L2) {
float_maxheap_array_t res = {
size_t(n), size_t(k), labels, distances};
search_knn_L2sqr (*this, n, x, idx, &res, store_pairs);
}
// NOTE(review): any other metric_type falls through here and the output
// buffers are left unwritten
}
和暴力搜索一样,先初始化存放返回结果的堆(L2 距离用最大堆,内积用最小堆),然后进入实质查询
4) 查询倒排表
/// Scan the selected inverted lists of a flat IVF index with squared-L2
/// distance and collect the k best candidates per query.
///
/// @param ivf          index whose `ids` / `codes` lists are scanned
/// @param nx           number of query vectors
/// @param x            query vectors, d floats each, stored back to back
/// @param keys         list ids to visit, ivf.nprobe entries per query
/// @param res          max-heap result array; row i receives query i's results
/// @param store_pairs  if true, store (list id << 32 | offset in list)
///                     as the result id instead of the stored vector id
void search_knn_L2sqr (const IndexIVFFlat &ivf,
size_t nx,
const float * x,
const long * keys,
float_maxheap_array_t * res,
bool store_pairs)
{
const size_t k = res->k;
// counters: lists visited and distances computed
size_t nlistv = 0, ndis = 0;
size_t d = ivf.d;
// queries are processed in parallel; the two counters are summed
// across threads by the reduction clause
#pragma omp parallel for reduction(+: nlistv, ndis)
for (size_t i = 0; i < nx; i++) {
// i-th query vector, its list ids, and its slice of the result heap
const float * xi = x + i * d;
const long * keysi = keys + i * ivf.nprobe;
float * __restrict disi = res->get_val (i);
long * __restrict idxi = res->get_ids (i);
maxheap_heapify (k, disi, idxi);
for (size_t ik = 0; ik < ivf.nprobe; ik++) {
long key = keysi[ik]; /* select the list */
if (key < 0) {
// not enough centroids for multiprobe
continue;
}
// NOTE(review): ik and ivf.nlist are size_t but are formatted with %ld
FAISS_THROW_IF_NOT_FMT (
key < (long) ivf.nlist,
"Invalid key=%ld at ik=%ld nlist=%ld\n",
key, ik, ivf.nlist);
nlistv++;
const size_t list_size = ivf.ids[key].size();
// the byte buffer of this list is read as packed float vectors
const float * list_vecs = (const float*)(ivf.codes[key].data());
for (size_t j = 0; j < list_size; j++) {
const float * yj = list_vecs + d * j;
float disij = fvec_L2sqr (xi, yj, d);
// disi[0] is the heap top; only a smaller distance replaces it
if (disij < disi[0]) {
maxheap_pop (k, disi, idxi);
long id = store_pairs ? (key << 32 | j) : ivf.ids[key][j];
maxheap_push (k, disi, idxi, disij, id);
}
}
ndis += list_size;
}
// final ordering of this query's heap contents
maxheap_reorder (k, disi, idxi);
}
// publish the per-call counters into the global statistics object
indexIVFFlat_stats.nq += nx;
indexIVFFlat_stats.nlist += nlistv;
indexIVFFlat_stats.ndis += ndis;
}
本段代码写的极为精简,所以copy过来慢慢讲,和暴力搜索过程极为类似
4.1) 初始化统计变量
size_t nlistv = 0, ndis = 0;
nlistv : 统计遍历的倒排list个数
ndis: 统计比对过的向量个数
4.2) 开启openmp
#pragma omp parallel for reduction(+: nlistv, ndis)
for (size_t i = 0; i < nx; i++) {
累计各线程计算的nlistv和ndis, 形成全局的nlistv和ndis
开启并行的查询
4.3) 准备好当前查询的变量
const float * xi = x + i * d;
const long * keysi = keys + i * ivf.nprobe;
float * __restrict disi = res->get_val (i);
long * __restrict idxi = res->get_ids (i);
maxheap_heapify (k, disi, idxi);
定位到当前的查询向量
定位到当前查询向量对应的粗聚类中心ID
定位到当前查询对应的返回最大堆
4.4) 开始遍历 nprobe 个粗聚类中心
for (size_t ik = 0; ik < ivf.nprobe; ik++) {
4.5) 确定当前的聚类中心ID
long key = keysi[ik]; /* select the list */
if (key < 0) {
// not enough centroids for multiprobe
continue;
}
FAISS_THROW_IF_NOT_FMT (
key < (long) ivf.nlist,
"Invalid key=%ld at ik=%ld nlist=%ld\n",
key, ik, ivf.nlist);
nlistv++;
获取当前对应的聚类中心ID并校验,然后累加nlistv (一个聚类中心点对应一个倒排list)
4.6) 核心的计算
const size_t list_size = ivf.ids[key].size();
const float * list_vecs = (const float*)(ivf.codes[key].data());
for (size_t j = 0; j < list_size; j++) {
const float * yj = list_vecs + d * j;
float disij = fvec_L2sqr (xi, yj, d);
if (disij < disi[0]) {
maxheap_pop (k, disi, idxi);
long id = store_pairs ? (key << 32 | j) : ivf.ids[key][j];
maxheap_push (k, disi, idxi, disij, id);
}
}
ndis += list_size;
获取当前聚类中心对应的倒排list
遍历倒排list的每个向量
和查询向量一一计算距离fvec_L2sqr (xi, yj, d)
细节见 文档 Faiss暴力搜索实现
将结果放入返回的最大堆
最后累加比对的向量的个数
4.7) 最大堆排序
maxheap_reorder (k, disi, idxi);
细节见文档 Faiss暴力搜索实现
4.8) 更新统计
indexIVFFlat_stats.nq += nx;
indexIVFFlat_stats.nlist += nlistv;
indexIVFFlat_stats.ndis += ndis;
小结
在暴力搜索的基础上,通过聚类将索引数据进行了分片,
查询时和暴力搜索相比,通过代价很小的粗查询(只需要比较少量的聚类中心点)迅速定位到某些分片,
使得计算向量距离的次数大大缩小,从而提升了性能