pal*_*uin 6 performance sql-server t-sql like query-performance
考虑这些简化的表:
-- Word list: one row per search term; hits is the nullable match counter
-- that the UPDATE statements in this file fill in.
CREATE TABLE dbo.words
(
    id   bigint      IDENTITY (1, 1) NOT NULL,
    word varchar(32) NOT NULL,
    hits int         NULL
);
-- Item list: one row per text body (at most 256 characters) to be searched.
CREATE TABLE dbo.items
(
    id   bigint       IDENTITY (1, 1) NOT NULL,
    body varchar(256) NOT NULL
);
Run Code Online (Sandbox Code Playgroud)
该words表包含大约 9000 条记录,每条记录包含一个单词('phone'、'sofa'、'house'、'dog'……)。该items表包含大约 12000 条记录,每条记录的正文不超过 256 个字符。
现在,我需要更新words表,统计items表中有多少条记录的body(至少一次)包含word字段中的文本。我需要考虑部分匹配的单词,因此以下 4 条记录都应计入单词dog:
-- NOTE(review): these two statements repeat, token for token, the dbo.words
-- and dbo.items definitions given earlier in this file. The surrounding text
-- refers to four example records, which do not appear here -- presumably lost
-- during extraction; confirm against the original post.
CREATE TABLE dbo.words
(
id bigint NOT NULL IDENTITY (1, 1),
word varchar(32) NOT NULL,
hits int NULL
)
CREATE TABLE dbo.items
(
id bigint NOT NULL IDENTITY (1, 1),
body varchar(256) NOT NULL,
)
Run Code Online (Sandbox Code Playgroud)
最后一个例子应该只计为一条记录(它至少包含一次单词“dog”)。
我可以使用这个查询:
-- Baseline approach: for every word, count the items whose body contains it
-- anywhere. Deliberately updates every row of dbo.words (no WHERE clause).
UPDATE w
SET w.hits =
    (
        SELECT COUNT(*)
        FROM dbo.items AS i
        WHERE i.body LIKE '%' + w.word + '%'
    )
FROM dbo.words AS w;
Run Code Online (Sandbox Code Playgroud)
但是,这非常慢:在我那台配置不高的服务器上,需要 10 多分钟才能完成。
据我所知,索引在这里帮不上忙,因为我做的是 LIKE 搜索。我也认为全文检索帮不了我,因为我要查找的是以搜索词开头、结尾或包含搜索词的单词。我在这一点上可能是错的。
关于如何加快速度的任何建议?
我发现,加速带前导通配符的LIKE搜索的最佳方法是使用 n-gram。我在《SQL Server 中的三元组(Trigram)通配符字符串搜索》一文中描述了该技术,并提供了一个示例实现。
三元组搜索的基本思想非常简单:
- 保留目标数据的三字符子串(三元组)。
- 将搜索词拆分为三元组。
- 将搜索三元组与存储的三元组匹配(相等搜索)。
- 将符合条件的行相交以查找与所有三元组匹配的字符串。
- 将原始搜索过滤器应用于大幅减少的交集。
它可能适合您的需求,但请注意:
三元组搜索不是灵丹妙药。额外的存储需求、实现的复杂性以及对更新性能的影响,都是它不容忽视的代价。
我做了一个快速测试:用莎士比亚全集填充items表的body列,共 15,838 行。我在words表中加载了来自同一文本的 7,669 个不重复的单词。
在我的中端笔记本电脑上,三元组结构在大约 2 秒内构建,以下更新语句在5 秒内完成:
-- Trigram-backed hit count: for every word, count the rows returned by
-- dbo.Items_TrigramSearch for the pattern '%<word>%'.
-- Deliberately updates every row of dbo.words (no WHERE clause).
UPDATE dbo.words WITH (TABLOCK)
SET hits =
    (
        SELECT COUNT_BIG(*)
        FROM dbo.Items_TrigramSearch('%' + word + '%') AS matched
    );
Run Code Online (Sandbox Code Playgroud)
更新的词表的选择:
我文章中修改后的三元组脚本如下:
-- Returns the distinct three-character substrings (trigrams) of @string in
-- which all three characters are ASCII letters or digits.
--
-- @string is varchar(256) so that a full-length dbo.items.body value is not
-- silently truncated when passed in; with a shorter parameter the trigram
-- ending at character 256 would never be generated for such a row.
-- Strings shorter than three characters yield no rows.
CREATE FUNCTION dbo.GenerateTrigrams (@string varchar(256))
RETURNS table
WITH SCHEMABINDING
AS RETURN
    WITH
    -- 16 constant rows, used only as a row source
    N16 AS
    (
        SELECT V.v
        FROM
        (
            VALUES
                (0),(0),(0),(0),(0),(0),(0),(0),
                (0),(0),(0),(0),(0),(0),(0),(0)
        ) AS V (v)
    ),
    -- Numbers 1..256 (16 x 16); a 256-character string needs at most 254
    Nums AS
    (
        SELECT n = ROW_NUMBER() OVER (ORDER BY A.v)
        FROM N16 AS A
        CROSS JOIN N16 AS B
    ),
    Trigrams AS
    (
        -- Every 3-character substring: start positions 1 .. LEN(@string) - 2
        SELECT TOP (CASE WHEN LEN(@string) > 2 THEN LEN(@string) - 2 ELSE 0 END)
            trigram = SUBSTRING(@string, N.n, 3)
        FROM Nums AS N
        ORDER BY N.n
    )
    -- Remove duplicates and ensure all three characters are alphanumeric
    SELECT DISTINCT
        T.trigram
    FROM Trigrams AS T
    WHERE
        -- Binary collation comparison so ranges work as expected
        T.trigram COLLATE Latin1_General_BIN2 NOT LIKE '%[^A-Z0-9a-z]%';
GO
-- Trigrams for items table: one row per (item id, trigram).
-- id is bigint to match dbo.items.id, so any item id can be stored without
-- an overflowing implicit conversion.
CREATE TABLE dbo.ItemsTrigrams
(
    id      bigint  NOT NULL,
    trigram char(3) NOT NULL
);
GO
-- Populate the trigram table: for each row of dbo.items, store one row per
-- trigram that dbo.GenerateTrigrams returns for its body.
INSERT INTO dbo.ItemsTrigrams WITH (TABLOCKX)
(
    id,
    trigram
)
SELECT
    src.id,
    tri.trigram
FROM dbo.items AS src
CROSS APPLY dbo.GenerateTrigrams(src.body) AS tri;
GO
-- Index used by the trigram search: clusters dbo.ItemsTrigrams on
-- (trigram, id) and enforces one row per pair; stored with row compression.
CREATE UNIQUE CLUSTERED INDEX [CUQ dbo.ItemsTrigrams (trigram, id)]
    ON dbo.ItemsTrigrams
    (
        trigram ASC,
        id ASC
    )
    WITH (DATA_COMPRESSION = ROW);
GO
-- Number of dbo.ItemsTrigrams rows per trigram; lets the search pick the
-- most selective trigrams (performance optimization).
CREATE OR ALTER VIEW dbo.ItemsTrigramCounts
WITH SCHEMABINDING
AS
    SELECT
        it.trigram,
        COUNT_BIG(*) AS cnt
    FROM dbo.ItemsTrigrams AS it
    GROUP BY
        it.trigram;
GO
-- Materialize dbo.ItemsTrigramCounts by giving it a unique clustered index
-- keyed on trigram.
CREATE UNIQUE CLUSTERED INDEX [CUQ dbo.ItemsTrigramCounts (trigram)]
    ON dbo.ItemsTrigramCounts
    (
        trigram ASC
    );
GO
-- Picks up to three trigrams of @string that have the lowest row counts in
-- dbo.ItemsTrigramCounts (i.e. the most selective ones).
-- Always returns exactly one row; trigram1..trigram3 are NULL for any slot
-- that could not be filled (all NULL when no trigram was found).
CREATE FUNCTION dbo.Items_GetBestTrigrams (@string varchar(255))
RETURNS table
WITH SCHEMABINDING
AS
RETURN
    SELECT
        -- Pivot the ranked rows into three columns
        MAX(CASE WHEN ranked.rn = 1 THEN ranked.trigram END) AS trigram1,
        MAX(CASE WHEN ranked.rn = 2 THEN ranked.trigram END) AS trigram2,
        MAX(CASE WHEN ranked.rn = 3 THEN ranked.trigram END) AS trigram3
    FROM
    (
        -- Trigrams of the search string that exist in the indexed view,
        -- rarest first, limited to three
        SELECT TOP (3)
            ROW_NUMBER() OVER (ORDER BY counts.cnt ASC) AS rn,
            parts.trigram
        FROM dbo.GenerateTrigrams(@string) AS parts
        INNER JOIN dbo.ItemsTrigramCounts AS counts WITH (NOEXPAND)
            ON counts.trigram = parts.trigram
        ORDER BY
            counts.cnt ASC
    ) AS ranked;
GO
-- Returns the ids of dbo.ItemsTrigrams rows that match all provided
-- (non-NULL) trigrams.
--
-- Parameters are consumed left to right:
--   * @Trigram1, @Trigram2 and @Trigram3 all non-NULL: ids present for all three
--   * @Trigram1 and @Trigram2 non-NULL, @Trigram3 NULL: ids present for both
--   * @Trigram1 non-NULL, @Trigram2 NULL: ids present for @Trigram1
--     (@Trigram3 is ignored in this case)
--   * @Trigram1 NULL: no rows
--
-- The returned id column is bigint to match dbo.items.id, so the caller's
-- join needs no type conversion and large ids cannot overflow.
CREATE FUNCTION dbo.Items_GetTrigramMatchIDs
(
    @Trigram1 char(3),
    @Trigram2 char(3),
    @Trigram3 char(3)
)
RETURNS @IDs table (id bigint PRIMARY KEY)
WITH SCHEMABINDING AS
BEGIN
    IF @Trigram1 IS NOT NULL
        AND @Trigram2 IS NOT NULL
        AND @Trigram3 IS NOT NULL
    BEGIN
        -- 3 trigrams available
        INSERT @IDs (id)
        SELECT ET1.id
        FROM dbo.ItemsTrigrams AS ET1
        WHERE ET1.trigram = @Trigram1
        INTERSECT
        SELECT ET2.id
        FROM dbo.ItemsTrigrams AS ET2
        WHERE ET2.trigram = @Trigram2
        INTERSECT
        SELECT ET3.id
        FROM dbo.ItemsTrigrams AS ET3
        WHERE ET3.trigram = @Trigram3
        OPTION (MERGE JOIN);
    END
    ELSE IF @Trigram1 IS NOT NULL
        AND @Trigram2 IS NOT NULL
    BEGIN
        -- 2 trigrams available
        INSERT @IDs (id)
        SELECT ET1.id
        FROM dbo.ItemsTrigrams AS ET1
        WHERE ET1.trigram = @Trigram1
        INTERSECT
        SELECT ET2.id
        FROM dbo.ItemsTrigrams AS ET2
        WHERE ET2.trigram = @Trigram2
        OPTION (MERGE JOIN);
    END
    ELSE IF @Trigram1 IS NOT NULL
    BEGIN
        -- 1 trigram available
        INSERT @IDs (id)
        SELECT ET1.id
        FROM dbo.ItemsTrigrams AS ET1
        WHERE ET1.trigram = @Trigram1;
    END;

    RETURN;
END;
GO
-- Search implementation: returns the body of every dbo.items row whose body
-- matches the LIKE pattern @Search.
--
-- When dbo.Items_GetBestTrigrams finds at least one trigram for @Search, only
-- the rows returned by dbo.Items_GetTrigramMatchIDs are tested with LIKE;
-- otherwise every row of dbo.items is tested. Exactly one of the two
-- UNION ALL branches can produce rows, because they test trigram1 for
-- NOT NULL and NULL respectively.
--
-- The table is referenced as dbo.items, exactly as it is created, so the
-- function also resolves under a case-sensitive database collation.
CREATE FUNCTION dbo.Items_TrigramSearch
(
    @Search varchar(255)
)
RETURNS table
WITH SCHEMABINDING
AS
RETURN
    SELECT
        Result.body
    FROM dbo.Items_GetBestTrigrams(@Search) AS GBT
    CROSS APPLY
    (
        -- Trigram search
        SELECT
            I.body
        FROM dbo.Items_GetTrigramMatchIDs
            (GBT.trigram1, GBT.trigram2, GBT.trigram3) AS MID
        INNER JOIN dbo.items AS I
            ON I.id = MID.id
        WHERE
            -- At least one trigram found
            GBT.trigram1 IS NOT NULL
            -- Re-check the original pattern on the reduced candidate set
            AND I.body LIKE @Search

        UNION ALL

        -- Non-trigram search
        SELECT
            I.body
        FROM dbo.items AS I
        WHERE
            -- No trigram found
            GBT.trigram1 IS NULL
            AND I.body LIKE @Search
    ) AS Result;
Run Code Online (Sandbox Code Playgroud)
唯一的其他更改是向表中添加聚集索引items:
-- Unique clustered index on dbo.items keyed by id, the column the trigram
-- search joins on.
CREATE UNIQUE CLUSTERED INDEX cuq
    ON dbo.items
    (
        id ASC
    );
Run Code Online (Sandbox Code Playgroud)