Lar*_*mie 6 t-sql statistics tile sql-server-2008
我需要一种类似于 NTILE() 的 T-SQL 排名方法,不同之处在于各个分块(tile)的成员数量呈滑动分布,即排名越高的分块成员越少。
例如
-- Scratch table for the ranking experiment: an identity key, a hit count
-- (defaults to 0) and a nullable PERCENTILE column.
CREATE TABLE #Rank_Table
(
    id         INT      IDENTITY(1, 1) NOT NULL,
    hits       BIGINT   NOT NULL DEFAULT 0,
    PERCENTILE SMALLINT NULL
);
-- Slant the distribution of the data.
-- Each generated value is the product of two random integers in 1..99; the
-- CASE then inflates the largest values and shrinks the smallest ones.
-- Rows come from cross joining master..spt_values with itself, capped at 10000.
INSERT INTO #Rank_Table (hits)
SELECT
    CASE
        WHEN seed.DATA > 9500 THEN seed.DATA * 30
        WHEN seed.DATA > 8000 THEN seed.DATA * 5
        WHEN seed.DATA < 7000 THEN seed.DATA / 3 + 1
        ELSE seed.DATA
    END
FROM
(
    SELECT TOP 10000
        (ABS(CHECKSUM(NEWID())) % 99 + 1) * (ABS(CHECKSUM(NEWID())) % 99 + 1) AS DATA
    FROM master..spt_values AS a
    CROSS JOIN master..spt_values AS b
) AS seed;
-- Target hit volume per group: total hits divided by (group count - 1),
-- using integer division.
DECLARE @numGroups AS SMALLINT;
DECLARE @hitsPerGroup AS BIGINT;

SET @numGroups = 100;

SELECT @hitsPerGroup = SUM(hits) / (@numGroups - 1)
FROM #Rank_Table;

-- Display the computed target
SELECT @hitsPerGroup AS HITS_PER_GROUP;
-- Baseline for comparison: NTILE produces an even distribution, i.e.
-- @numGroups groups of (nearly) equal row counts, highest hits first.
SELECT
    id,
    HITS,
    NTILE(@numGroups) OVER (ORDER BY HITS DESC) AS PERCENTILE
FROM #Rank_Table
GROUP BY
    id,
    HITS;
-- Best attempt so far: derive the tile from a running total of hits.
-- For every row, the applied subquery sums the hits of all rows whose hits are
-- less than or equal to the current row's hits; dividing that total by
-- @hitsPerGroup (integer division) and adding 1 gives the tile.
-- Known problem: it skips groups because of the erratic distribution.
SELECT
    cur.ID,
    cur.hits,
    running.RunningTotal / @hitsPerGroup + 1 AS TILE,
    running.RunningTotal
FROM #Rank_Table AS cur
CROSS APPLY
(
    SELECT SUM(prior.hits) AS RunningTotal
    FROM #Rank_Table AS prior
    WHERE prior.hits <= cur.hits
) AS running
ORDER BY cur.hits;

-- Remove the scratch table
DROP TABLE #Rank_Table;
Run Code Online (Sandbox Code Playgroud)
在 #Rank_Table 中,NTILE(@numGroups) 会把数据均匀地分成 @numGroups 个组。我需要的同样是 @numGroups 个组,但 tile 1 的成员最少,tile 2 的成员比 tile 1 多 1 个或更多,tile 3 的成员比 tile 2 多 1 个或更多……依此类推,tile 100 的成员最多。
我正在使用SQL Server 2008.实际上,这将针对可能有数百万行的永久表运行,以便定期更新PERCENTILE列,其百分位数为1-100.
我上面的最佳尝试会跳过某些百分位,而且性能不佳。肯定有更好的办法。
为了创建更线性的分布,我向数据表添加了一个名为 HITS_SQRT 的计算列:HITS_SQRT AS (CONVERT([int],sqrt(HITS*4),(0))) PERSISTED。
使用此列,您可以计算“每百分位点击数”的目标数量。
--Target amount of square-rooted hits per percentile, plus the number of rows.
--SUM over an int column returns int and raises an arithmetic overflow once the
--total passes 2,147,483,647, so the column is widened to bigint before summing.
--NULLIF turns a zero divisor (@numGroups = 1) into NULL instead of an error.
select @hitsPerGroup=SUM(CAST(HITS_SQRT AS bigint))/NULLIF(@numGroups -1, 0)-@numGroups, @dataPoints=COUNT(*) FROM #Rank_Table
Run Code Online (Sandbox Code Playgroud)
然后,该脚本创建一个临时表,其中包含按命中数排序的 ROW_NUMBER(),并逐行迭代,按从 100 到 1 的递减顺序更新各行的百分位数。迭代过程中保留命中数的运行总计,每当运行总计达到 @hitsPerGroup 时,百分位数就降低一级:从 100 降到 99,从 99 降到 98,依此类推。
然后使用其百分位数更新源数据表。临时工作表有一个索引以加快更新速度。
以下是使用 #Rank_Table 作为源数据表的完整脚本。
-- Create test data.
-- Same layout as a plain hits table, plus HITS_SQRT: a persisted computed
-- column holding SQRT(hits * 4) converted to int.
CREATE TABLE #Rank_Table
(
    id         INT      IDENTITY(1, 1) NOT NULL,
    hits       BIGINT   NOT NULL DEFAULT 0,
    PERCENTILE SMALLINT NULL,
    HITS_SQRT  AS (CONVERT(INT, SQRT(hits * 4), 0)) PERSISTED
);
-- Slant the distribution of the data.
-- The inner query yields up to 10000 products of two random integers in 1..99;
-- the outer CASE multiplies values above 9500 by 30 and values above 8000 by 5,
-- reduces values below 7000 to a third plus one, and keeps the rest unchanged.
INSERT INTO #Rank_Table (hits)
SELECT
    CASE
        WHEN src.DATA > 9500 THEN src.DATA * 30
        WHEN src.DATA > 8000 THEN src.DATA * 5
        WHEN src.DATA < 7000 THEN src.DATA / 3 + 1
        ELSE src.DATA
    END
FROM
(
    SELECT TOP 10000
        (ABS(CHECKSUM(NEWID())) % 99 + 1) * (ABS(CHECKSUM(NEWID())) % 99 + 1) AS DATA
    FROM master..spt_values AS left_rows
    CROSS JOIN master..spt_values AS right_rows
) AS src;
--Create the variables used to calculate percentiles
--@hitsPerGroup is bigint: it is derived from a SUM that can exceed the int range
Declare @hitsPerGroup as bigint
Declare @numGroups as int
Declare @dataPoints as int
set @numGroups=100
--Threshold of square-rooted hits per group, and the number of rows to process.
--SUM over an int column returns int and raises an arithmetic overflow once the
--total passes 2,147,483,647, so HITS_SQRT is widened to bigint before summing.
--NULLIF turns a zero divisor (@numGroups = 1) into a NULL threshold instead of
--a divide-by-zero error.
select
    @hitsPerGroup = SUM(CAST(HITS_SQRT AS bigint)) / NULLIF(@numGroups - 1, 0) - @numGroups,
    @dataPoints = COUNT(*)
FROM #Rank_Table
--show the number of hits that each group should have
select @hitsPerGroup HITS_PER_GROUP
--Use temp table for the calculation.
--row:        position of the record when ordered by hits ascending
--hits:       the record's HITS_SQRT value (not the raw hit count)
--ID:         id of the record in #Rank_Table
--PERCENTILE: left NULL here; filled in by the calculation that follows
CREATE TABLE #tbl (
    row int,
    hits int,
    ID bigint,
    PERCENTILE smallint null
)
--add index to row; ROW_NUMBER() values are distinct, so the index can be unique
CREATE UNIQUE CLUSTERED INDEX idxRow ON #tbl(row)
--Explicit column list so the insert does not depend on column order.
--ID is a tie-breaker: ordering by HITS alone leaves the numbering of rows with
--equal hits undefined, so repeated runs could assign different percentiles.
insert INTO #tbl (row, hits, ID)
select
    ROW_NUMBER() over (ORDER BY HITS, ID),
    HITS_SQRT,
    ID
from #Rank_Table
--Update each row with a running total.
--Rows are visited in ascending row order. Each time the running total of hits
--reaches @hitsPerGroup, the unassigned rows before the current row receive the
--current percentile, the percentile is lowered by one and the total is reset.
--The row that crossed the threshold becomes the first row of the next group.
DECLARE @row as int
DECLARE @runningTotal as int
DECLARE @percentile as int
DECLARE @groupStart as int
--ROW_NUMBER() starts at 1, so there is no row 0 to visit
set @row = 1
set @runningTotal = 0
set @percentile = @numGroups
--first row that has not been given a percentile yet
set @groupStart = 1
while @row <= @dataPoints
BEGIN
    select @runningTotal = @runningTotal + hits from #tbl where row = @row
    if @runningTotal >= @hitsPerGroup
    BEGIN
        --The unassigned rows are exactly @groupStart .. @row - 1, so a range on the
        --clustered key replaces the "PERCENTILE is null" scan from row 1.
        update #tbl
        set PERCENTILE = @percentile
        WHERE row >= @groupStart and row < @row
        set @groupStart = @row
        --Never go below 1: if the threshold is crossed more often than there are
        --groups, the surplus rows are merged into the last group instead of
        --receiving percentile 0 or a negative value.
        if @percentile > 1
            set @percentile = @percentile - 1
        set @runningTotal = 0
    END
    --change rows
    set @row = @row + 1
END
--get remaining: rows after the last threshold crossing
update #tbl
set PERCENTILE = @percentile
WHERE PERCENTILE is null
-- Copy the computed percentiles from the work table back to the source rows,
-- matching on ID.
UPDATE src
SET src.PERCENTILE = work.PERCENTILE
FROM #Rank_Table AS src
INNER JOIN #tbl AS work
    ON work.ID = src.ID;
-- Show the results: per percentile, the number of records and their total hits
SELECT
    PERCENTILE,
    COUNT(id) AS NUMBER_RECORDS,
    SUM(HITS) AS HITS_IN_PERCENTILE
FROM #Rank_Table
GROUP BY PERCENTILE
ORDER BY PERCENTILE;
-- Cleanup: remove both temp tables created by this script
DROP TABLE #Rank_Table, #tbl;
Run Code Online (Sandbox Code Playgroud)
性能不是很出色,但它实现了平滑滑动分布的目标。