def brute_force_search(text_vectors, pattern_vectors, threshold=0.5):
    """Find text vectors whose cosine similarity to any pattern meets a threshold.

    Both the text vectors and the pattern vectors are L2-normalised before
    being compared through an exact inner-product Faiss index, so the reported
    score is the cosine similarity regardless of the input magnitudes.

    Args:
        text_vectors: Sequence of equal-length numeric vectors to search in.
        pattern_vectors: Sequence of numeric vectors to search for; each must
            have the same dimension as the text vectors.
        threshold: Minimum cosine similarity (inclusive) for a hit.

    Returns:
        A list of ``(text_index, similarity)`` tuples covering every
        (pattern, text) pair that met the threshold, sorted by similarity in
        descending order. The same hits are also printed, one per line.
        An empty list is returned when either input is empty.
    """
    n = len(text_vectors)
    m = len(pattern_vectors)
    # Nothing to compare; return before ``shape[1]`` is read, because an empty
    # input converts to a 1-D array that has no second axis.
    if n == 0 or m == 0:
        return []

    def _unit_rows(rows):
        # Scale every row to unit length. Zero rows are left as zeros (their
        # similarity to anything is then 0) instead of dividing by zero.
        rows = np.asarray(rows, dtype='float32')
        norms = np.linalg.norm(rows, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        # Faiss requires C-contiguous float32 input.
        return np.ascontiguousarray(rows / norms, dtype='float32')

    text_unit = _unit_rows(text_vectors)
    pattern_unit = _unit_rows(pattern_vectors)

    # Exact inner-product index; on unit vectors the inner product is the
    # cosine similarity.
    index = faiss.IndexFlatIP(text_unit.shape[1])
    index.add(text_unit)

    # One batched query for all patterns, asking for all n neighbours so the
    # threshold alone decides what counts as a match.
    scores, ids = index.search(pattern_unit, n)

    # Boolean-mask selection walks the result row by row, i.e. pattern by
    # pattern, which keeps ties in a deterministic order after the stable sort.
    keep = scores >= threshold
    matches = [(int(i), float(s)) for i, s in zip(ids[keep], scores[keep])]

    # Highest similarity first.
    matches.sort(key=lambda match: match[1], reverse=True)

    for text_index, similarity in matches:
        print(f"Pattern found at index {text_index} with similarity {similarity:.4f}")

    return matches

运行上述代码时报错:

OMP: Error #15: Initializing libomp140.x86_64.dll, but found libiomp5md.dll already initialized.
OMP: Hint This means that multiple copies of the OpenMP runtime have been linked into the program. That is dangerous, since it can degrade performance or cause incorrect results. The best thing to do is to ensure that only a single OpenMP runtime is linked into the process, e.g. by avoiding static linking of the OpenMP runtime in any library. As an unsafe, unsupported, undocumented workaround you can set the environment variable KMP_DUPLICATE_LIB_OK=TRUE to allow the program to continue to execute, but that may cause crashes or silently produce incorrect results. For more information, please see http://openmp.llvm.org/

解决方案:导入 sklearn 包。

import sklearn

Logo

腾讯云面向开发者汇聚海量精品云计算使用和开发经验,营造开放的云计算技术生态圈。

更多推荐