结合全文索引和标量索引

Tim*_*Tim 8 index sql-server full-text-search sql-server-2012

假设我们有一个包含 1200 万个姓名和地址的数据库,需要使用全文进行搜索,但每一行也包含一个整数值,比方说COMPANYID。该表在这 1200 万行中包含大约 250 个不同的 COMPANYID。

在定义全文索引时,是否有可能COMPANY在树中为每个索引提供自己的“分支”?

wBo*_*Bob 3

简短的回答是“不”,而且您并不真正需要这个。全文索引是倒排索引,因此它们通过创建全文索引时必须指定的唯一 doc_id 存储分割词。这必须是“唯一、单键、不可为空的列”,最好是整数。本质上外键的内容没有被计算出来,并且没有简单的方法可以在此基础上对它们进行分区。

可以使用每个公司一个表和每个表的全文索引来欺骗类似的东西。您需要某种代码逻辑位于前面来确定要插入到哪个表/从中获取。这将是一个相当令人头疼的管理问题,而且几乎肯定不值得。

如果您的数据量很大(例如,更像 230 亿条记录),那么您可以考虑分片解决方案,例如,每个公司都有一个 Azure VM,前面有一个应用程序,以确定要连接到哪台机器。但显然你也不需要那个。

SQL 2008 还对全文进行了许多改进,现在更加集成到数据库引擎中。一种场景是针对普通列指定 WHERE 子句并使用全文函数,这种场景称为“混合查询”,并在此处进行讨论。尽管该信息适用于 SQL 2008,但这仍然是一篇很棒的文章。

如果您通常关心性能和计划,为什么不启动一些测试数据,引入一些偏差并尝试一下。我在几分钟内就完成了这个包含大约 200 万行的脚本:

!!TODO introduce some skew
USE master
GO

SET NOCOUNT ON
GO

DBCC TRACEON(610)   -- Minimal logging
GO

GO

IF EXISTS ( SELECT * FROM sys.databases WHERE name = 'fullTextDemo' )
BEGIN
    ALTER DATABASE fullTextDemo SET SINGLE_USER WITH ROLLBACK IMMEDIATE
    DROP DATABASE fullTextDemo
END
GO

IF NOT EXISTS ( SELECT * FROM sys.databases WHERE name = 'fullTextDemo' )
CREATE DATABASE fullTextDemo
GO

ALTER DATABASE fullTextDemo SET RECOVERY SIMPLE
GO

USE fullTextDemo
GO

IF OBJECT_ID('dbo.yourAddresses') IS NOT NULL DROP TABLE dbo.yourAddresses
IF OBJECT_ID('dbo.companies') IS NOT NULL DROP TABLE dbo.companies
GO

CREATE TABLE dbo.companies (
    companyId       INT IDENTITY NOT NULL,
    companyName     NVARCHAR(50) NOT NULL,

    CONSTRAINT PK_companies PRIMARY KEY ( companyId )
)
GO

CREATE TABLE dbo.yourAddresses (
    rowId           INT IDENTITY,
    companyId       INT NOT NULL FOREIGN KEY REFERENCES dbo.companies ( companyId ),
    searchTerms     NVARCHAR(2048) NOT NULL

    CONSTRAINT PK_yourAddresses PRIMARY KEY ( rowId )
)
GO

-- Populate the companies
;WITH cte AS (
SELECT TOP 250 ROW_NUMBER() OVER ( ORDER BY ( SELECT 1 ) ) rn
FROM master.sys.columns c1
    CROSS JOIN master.sys.columns c2
    CROSS JOIN master.sys.columns c3
)
INSERT INTO dbo.companies ( companyName )
SELECT NEWID()
FROM cte
GO

-- Generate 2,636,000 records
INSERT dbo.yourAddresses ( companyId, searchTerms )
SELECT c.companyId, m.[text]
FROM dbo.companies c
    CROSS JOIN ( SELECT * FROM sys.messages ) m
WHERE m.language_id = 1033
AND m.[text] Like '[a-z]%'
GO

CREATE INDEX _idx ON dbo.yourAddresses ( companyId ) INCLUDE ( searchTerms )
GO

-- !!TODO look at compression
--ALTER INDEX PK_yourAddresses ON dbo.yourAddresses REBUILD WITH ( DATA_COMPRESSION = PAGE )
--GO

-- Create the catalog
IF NOT EXISTS ( SELECT * FROM sys.fulltext_catalogs WHERE name = N'ftc_yourAddresses' )
CREATE FULLTEXT CATALOG ftc_yourAddresses
GO

-- Create the full-text index
CREATE FULLTEXT INDEX ON dbo.yourAddresses ( searchTerms ) KEY INDEX PK_yourAddresses ON ftc_yourAddresses WITH CHANGE_TRACKING MANUAL  -- CHANGE_TRACKING OFF, NO POPULATION
GO

SELECT 'before' ft, * FROM sys.fulltext_indexes
GO

ALTER FULLTEXT INDEX ON dbo.yourAddresses START FULL POPULATION;
GO


DECLARE @i INT 
SET @i = 0

WHILE EXISTS ( SELECT * FROM sys.fulltext_indexes WHERE has_crawl_completed = 0 )
BEGIN

        SELECT outstanding_batch_count, *
        FROM sys.dm_fts_index_population
        WHERE database_id = DB_ID()

        --SELECT *
        --FROM sys.dm_fts_outstanding_batches
        --WHERE database_id = DB_ID()

    WAITFOR DELAY '00:00:05'

    SET @i = @i + 1
    IF @i > 60 BEGIN RAISERROR( 'Too many loops!', 16, 1 ) BREAK END

END

SELECT 'after' ft, * FROM sys.fulltext_indexes
GO



SELECT TOP 1000 *
FROM dbo.yourAddresses ft
WHERE companyId = 42
 AND CONTAINS ( searchTerms, 'data' )
GO

SELECT TOP 1000 *
FROM dbo.yourAddresses a
    INNER JOIN CONTAINSTABLE ( dbo.yourAddresses, searchTerms, 'data' ) ct ON a.rowId = ct.[key]
WHERE a.companyId = 42
GO

SELECT TOP 1000 *
FROM dbo.yourAddresses a
    INNER JOIN CONTAINSTABLE ( dbo.yourAddresses, searchTerms, 'data' ) ct ON a.rowId = ct.[key]
WHERE a.companyId = 42
OPTION ( MERGE JOIN )
GO

SELECT TOP 100 *
FROM sys.dm_fts_index_keywords (DB_ID(), OBJECT_ID('dbo.yourAddresses') )

SELECT TOP 100 *
FROM sys.dm_fts_index_keywords_by_document(DB_ID(), OBJECT_ID('dbo.yourAddresses') )
ORDER BY document_id
GO
Run Code Online (Sandbox Code Playgroud)