SQL 查询中 WHERE 子句的替代方案

Nis*_*R05 4 sql-server sql-server-2012 cardinality-estimates

我们有 SQL Server 2012 Enterprise Edition 通过存储过程运行以下查询:

declare @TopX int = 1000
declare @stores Table (Store varchar(5), LastDate datetime, LastId int, RangeEnd datetime)
insert into @stores 
  select * 
  from (select SourceStore, '2014-01-01' as i, null as ii, '2014-01-08' as iii 
        from StoreConfig.dbo.Version 
        group by SourceStore
       ) t 
  where (ABS(CAST((BINARY_CHECKSUM(*) * RAND()) as int)) % 100) < 50

IF OBJECT_ID('tempdb..#agreements') IS NOT NULL
DROP TABLE #agreements
IF OBJECT_ID('tempdb..#stores') IS NOT NULL
DROP TABLE #stores

select Store, 
       isnull(LastDate, '1899-01-01') StartDate, 
       isnull(LastId, -1) LastId, 
       isnull(RangeEnd, getdate()) RangeEnd
into #stores
from @stores
update #stores set StartDate = '2015-07-01', RangeEnd='2015-07-08'


-- THIS IS NOT FAST.. :(

SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;
--grab the @TopX agreements from primary customers
--note: only grabbing the columns in our index to prevent RID lookups 
  --on every agreement before we sort and take a relatively tiny subset
select top 1000
       a.SourceStore
       ,a.AgreementId
       ,isnull(a.ModifiedDate, a.CreatedDate) as ModifiedDate
       ,ca.CustomerId
from #stores s
inner join StoreOps.POSREPL3Agreement.Agreement a on a.SourceStore = s.Store
inner join StoreOps.Customer.CustomerAgreement ca on ca.SourceStore = a.SourceStore 
                                                 and ca.AgreementId = a.AgreementId 
                                                 and ca.IsPrimary = 1
where ( (a.ModifiedDate between s.StartDate and s.RangeEnd) 
       or ( a.ModifiedDate is null 
        and a.CreatedDate between s.StartDate and s.RangeEnd)
      ) 
  and ( isnull(a.ModifiedDate, a.CreatedDate) > s.StartDate 
     or a.AgreementId > s.LastId
      )
order by isnull(a.ModifiedDate, a.CreatedDate), a.AgreementId
Run Code Online (Sandbox Code Playgroud)

我们有以下[Customer].[CustomerAgreement]表索引:

CREATE NONCLUSTERED INDEX [IX_CustomerAgreement_SourceStore_AgreementId] 
ON [Customer].[CustomerAgreement]
([SourceStore] ASC, [AgreementId] ASC, [IsPrimary] ASC)
INCLUDE ([CustomerId]) 
ON [PRIMARY]
GO
Run Code Online (Sandbox Code Playgroud)

这是[POSREPL3Agreement].[Agreement]表的索引:

CREATE NONCLUSTERED INDEX [IX_Agreement_SourceStore_ModifiedDate]  
ON [POSREPL3Agreement].[Agreement]
([SourceStore] ASC, [ModifiedDate] ASC, [CreatedDate] ASC)
INCLUDE ([AgreementId]) 
ON [PRIMARY]
GO
Run Code Online (Sandbox Code Playgroud)

如果我们删除WHERE子句索引按预期工作,我们会从两个表中看到 1000 条记录,但是当我们添加列出的WHERE子句时,[Customer].[CustomerAgreement]估计所有记录而不是 1000 条。

我们如何改进WHERE子句或索引以使[Agreement]表与 [CustomerAgreement]表对齐,以便估计行[CustomerAgreement]不是所有记录?

表定义

CREATE TABLE [Customer].[CustomerAgreement](
    [CustomerAgreementId] [int] NOT NULL,
    [CustomerId] [int] NOT NULL,
    [AgreementId] [int] NOT NULL,
    [IsPrimary] [bit] NOT NULL,
    [Store] [varchar](5) NOT NULL,
    [SourceStore] [varchar](5) NOT NULL,
    [RowGUID] [uniqueidentifier] ROWGUIDCOL  NOT NULL,
    [Repl_ID] [tinyint] NOT NULL,
 CONSTRAINT [PK_Customer_CustomerAgreementID_SourceStore] PRIMARY KEY NONCLUSTERED 
(
    [CustomerAgreementId] ASC,
    [SourceStore] ASC,
    [Repl_ID] ASC,
    [RowGUID] ASC
)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
) ON [PRIMARY]

CREATE TABLE [POSREPL3Agreement].[Agreement](
    [AgreementId] [int] NOT NULL,
    [QuoteId] [int] NULL,
    [AgreementNumber] [varchar](11) NOT NULL,
    [AgreementStatusId] [tinyint] NOT NULL,
    [AgreementPrinted] [bit] NOT NULL,
    [IsNewPOSCreated] [bit] NOT NULL,
    [LeaseFrequencyId] [tinyint] NOT NULL,
    [DeferredLateFeeAmount] [decimal](8, 2) NOT NULL,
    [InHomeVisitFeeAmount] [decimal](8, 2) NOT NULL,
    [IsASPTaxable] [bit] NOT NULL,
    [ServicePlusRate] [decimal](7, 5) NOT NULL,
    [ServicePlusFloor] [decimal](5, 2) NOT NULL,
    [TaxRatePercentage] [decimal](7, 5) NOT NULL,
    [IgnoreTaxRateChange] [bit] NOT NULL,
    [Balance] [decimal](8, 2) NOT NULL,
    [AmountPaidToDate] [decimal](10, 2) NOT NULL,
    [Deposit] [decimal](8, 2) NOT NULL,
    [DeliveryFee] [decimal](8, 2) NOT NULL,
    [StartDate] [datetime] NOT NULL,
    [DueDay] [int] NOT NULL,
    [DueDayTypeId] [tinyint] NULL,
    [PaidThroughDate] [datetime] NOT NULL,
    [PayOutDate] [datetime] NOT NULL,
    [FinalDate] [datetime] NULL,
    [IsNSFOutstanding] [bit] NOT NULL,
    [LeadSourceId] [tinyint] NOT NULL,
    [AgreementTypeId] [tinyint] NOT NULL,
    [AcquisitionAgreementTypeId] [tinyint] NULL,
    [SameAsCashDuration] [int] NOT NULL,
    [SameAsCashDurationType] [int] NOT NULL,
    [MinimumPercentageOfCashPriceForFinalPayment] [decimal](3, 2) NOT NULL,
    [EarlyPayoutLeaseAmountRate] [decimal](3, 2) NOT NULL,
    [EarlyPayout] [decimal](10, 2) NULL,
    [FinalPaymentAdditionalFee] [decimal](8, 2) NOT NULL,
    [FinalPaymentProrateAmount] [decimal](10, 2) NOT NULL,
    [CreditedAssociateId] [int] NULL,
    [NonRenewalGracePeriod] [int] NOT NULL,
    [LastAgreementTransactionId] [int] NULL,
    [CanPayout] [bit] NOT NULL,
    [IsServicePlusIncludedInPayout] [bit] NOT NULL,
    [IsProrateEnabled] [bit] NOT NULL,
    [AgreementDocument] [varbinary](max) NULL,
    [CreatedDate] [datetime] NOT NULL,
    [CreatedBy] [int] NULL,
    [ModifiedDate] [datetime] NULL,
    [ModifiedBy] [int] NULL,
    [Store] [varchar](5) NOT NULL,
    [SourceStore] [varchar](5) NOT NULL,
    [RowGUID] [uniqueidentifier] ROWGUIDCOL  NOT NULL,
    [Repl_ID] [tinyint] NOT NULL,
    [ECommerceDeliveredDate] [datetime] NULL,
    [DDEStatusId] [tinyint] NULL,
    [DDEAmount] [decimal](8, 2) NULL,
    [OrderMethodTypeId] [int] NULL,
    [SemiMonthlyUpcharge] [decimal](8, 2) NULL,
    [DefaultNonRenewalFee] [decimal](8, 2) NULL,
    [DefaultInHomeVisitFee] [decimal](8, 2) NULL,
    [NonRenewalSemiMonthlyFee] [decimal](8, 2) NULL,
    [NonRenewalSemiMonthlyFeeGracePeriod] [int] NULL,
    [NonRenewalFeeTypeId] [int] NULL,
    [NonRenewalSemiMonthlyRate] [decimal](8, 4) NULL,
    [NonRenewalMonthlyRate] [decimal](8, 4) NULL,
    [NonRenewalWeeklyFeeGracePeriod] [int] NULL,
    [NonRenewalWeeklyRate] [decimal](8, 4) NULL,
    [NonRenewalWeeklyFee] [decimal](8, 2) NULL,
    [DefaultNSFFee] [decimal](8, 2) NULL,
    [WeeklyUpcharge] [decimal](8, 2) NULL,
    [IsInHomeFeeEnabled] [bit] NULL,
    [CanChargeInHomeFeeAndNonRenewalFeeInSamePeriod] [bit] NULL,
    [ExtensionBalance] [decimal](8, 2) NOT NULL,
 CONSTRAINT [PK_Agreement_AgreementID_SourceStore] PRIMARY KEY NONCLUSTERED 
(
    [AgreementId] ASC,
    [SourceStore] ASC,
    [Repl_ID] ASC,
    [RowGUID] ASC
)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY]
Run Code Online (Sandbox Code Playgroud)

执行计划

在此处下载 Showplan XML (pastebin)

Pau*_*ite 10

如果我们删除 WHERE子句索引按预期工作,我们会从两个表中看到 1000 条记录,但是当我们添加列出的WHERE子句时,[Customer].[CustomerAgreement]估计所有记录而不是 1000 条。

简短的回答是,如果没有对子WHERE句中的行进行任何过滤,查询优化器估计它只需要从每个表中读取 1000 行即可产生 1000 行的所需结果。

您没有为此查询提供执行计划,所以我不能说更多。ORDER BY在这种情况下,您的查询很可能也省略了该子句,否则可能需要进行 Sort,这通常需要从其子树中读取所有行。

随着 WHERE子句,优化器期望在第 1000 行可返回给客户端之前读取更多行(由于预期的过滤效果)。估计的和实际读取的行数之间的差异是由于从可用统计信息估计复杂谓词的选择性的问题。即使假设统计数据具有代表性,这也是正确的。从根本上说:太难了,优化器错误地估计了基数。

1. 索引计算列解决方案

您可能会发现以下更改是值得的:

  1. 索引#stores 临时表:

    CREATE UNIQUE CLUSTERED INDEX index_name 
    ON #stores 
    (
        Store,
        StartDate
    );
    
    Run Code Online (Sandbox Code Playgroud)
  2. 将计算列添加到协议表。这不使用存储,并且是非常快速的仅元数据操作:

    ALTER TABLE POSREPL3Agreement.Agreement
    ADD ComputedModifiedDate
    AS ISNULL(ModifiedDate, CreatedDate);
    
    Run Code Online (Sandbox Code Playgroud)
  3. 创建(或修改现有)索引以使用计算列。这将满足ORDER BY条款。

    CREATE INDEX index_name 
    ON POSREPL3Agreement.Agreement
    (
        ComputedModifiedDate,
        AgreementId
    )
    INCLUDE (SourceStore);
    
    Run Code Online (Sandbox Code Playgroud)
  4. 简化查询以直接引用计算列:

    SELECT TOP (1000)
        A.SourceStore,
        A.AgreementId,
        ModifiedDate = A.ComputedModifiedDate
    FROM #stores AS S
    JOIN POSREPL3Agreement.Agreement AS A
        ON A.SourceStore = S.Store
    JOIN Customer.CustomerAgreement AS CA
        ON CA.SourceStore = A.SourceStore
        AND CA.AgreementId = A.AgreementId
    WHERE
        CA.IsPrimary = 1
        AND A.ComputedModifiedDate BETWEEN S.StartDate AND S.RangeEnd
        AND 
        ( 
            A.ComputedModifiedDate > S.StartDate
            OR A.AgreementId > S.LastId
        )
    ORDER BY
        A.ComputedModifiedDate, 
        A.AgreementId;
    
    Run Code Online (Sandbox Code Playgroud)

如果您无法按所示重写查询,由于技术原因,计算列索引将需要包含两个额外的列:

CREATE INDEX index_name 
ON POSREPL3Agreement.Agreement
(
    ComputedModifiedDate,
    AgreementId
)
INCLUDE 
(
    SourceStore,
    CreatedDate,
    ModifiedDate
)
WITH DROP_EXISTING;
Run Code Online (Sandbox Code Playgroud)

预期的执行计划仍然会显示不准确的估计(因为优化器对停止扫描有序计算列索引的速度过于乐观)但(前 N 和不同)排序被消除,并且应该仍然表现得更好:

预期执行计划

2. 全面的索引视图解决方案

如果您无法添加计算列和索引,您可以使用索引视图进行调查:

CREATE VIEW dbo.ViewName
WITH SCHEMABINDING
AS
SELECT
    A.SourceStore,
    A.AgreementId,
    ComputedModifiedDate = ISNULL(A.ModifiedDate, A.CreatedDate)
FROM POSREPL3Agreement.Agreement AS A
JOIN Customer.CustomerAgreement AS CA
    ON CA.SourceStore = A.SourceStore
    AND CA.AgreementId = A.AgreementId
WHERE
    CA.IsPrimary = 1;
GO
CREATE UNIQUE CLUSTERED INDEX index_name
ON dbo.ViewName
(
    ComputedModifiedDate,
    AgreementId,
    SourceStore
);
Run Code Online (Sandbox Code Playgroud)

然后查询变为:

SELECT TOP (1000)
    VN.SourceStore,
    VN.AgreementId,
    ModifiedDate = VN.ComputedModifiedDate 
FROM #stores AS S
JOIN dbo.ViewName AS VN
    WITH (NOEXPAND)
    ON VN.SourceStore = S.Store
WHERE
    VN.ComputedModifiedDate BETWEEN S.StartDate AND S.RangeEnd
    AND 
    ( 
        VN.ComputedModifiedDate > S.StartDate
        OR VN.AgreementId > S.LastId
    )
ORDER BY
    VN.ComputedModifiedDate, 
    VN.AgreementId;
Run Code Online (Sandbox Code Playgroud)

3. 更简单的索引视图解决方案

还可以在索引视图中更直接地反映计算列解决方案,尽管这种想法并没有消除连接:

CREATE VIEW dbo.ViewName
WITH SCHEMABINDING
AS
SELECT
    A.SourceStore,
    A.AgreementId,
    ComputedModifiedDate = ISNULL(A.ModifiedDate, A.CreatedDate)
FROM POSREPL3Agreement.Agreement AS A;
GO
CREATE UNIQUE CLUSTERED INDEX index_name
ON dbo.ViewName
(
    ComputedModifiedDate,
    AgreementId,
    SourceStore
);
Run Code Online (Sandbox Code Playgroud)

这次查询变成了:

SELECT TOP (1000)
    VN.SourceStore,
    VN.AgreementId,
    ModifiedDate = VN.ComputedModifiedDate
FROM #stores AS S
JOIN dbo.ViewName AS VN
    WITH (NOEXPAND)
    ON VN.SourceStore = S.Store
JOIN Customer.CustomerAgreement AS CA
    ON CA.SourceStore = VN.SourceStore
    AND CA.AgreementId = VN.AgreementId
WHERE
    CA.IsPrimary = 1
    AND VN.ComputedModifiedDate BETWEEN S.StartDate AND S.RangeEnd
    AND 
    ( 
        VN.ComputedModifiedDate > S.StartDate
        OR VN.AgreementId > S.LastId
    )
ORDER BY
    VN.ComputedModifiedDate, 
    VN.AgreementId;
Run Code Online (Sandbox Code Playgroud)

我不得不猜测视图中哪些列是唯一的。如果您实施任一解决方案,请考虑更广泛的工作负载,并在必要时从基表添加唯一列以使视图聚集索引唯一。该[RowGUID]列看起来喜欢这一点。


您应该仔细测试以评估索引视图对基表的数据更改的影响以及所需的存储量。

使用这些解决方案中的任何一个,如果临时表包含非常早的日期范围,或者如果存储没有最近创建或修改的日期,则性能可能不如预期。您应该对它进行测试,看看它与您的真实数据和要求如何相符。

您应该删除将隔离级别设置为的语句,READ UNCOMMITTED如果它只是为了提高性能而拼命尝试的话。详情请看我的文章

另外,我注意到两个表当前都是堆。您应该知道,出于空间管理的原因,如果没有别的原因,大多数表都可以从聚集索引中受益。如果堆经历删除,您可能需要不时重建它们以回收已完成自动释放的空页。

顺便说一句,RAND@stores 查询中的 没有您可能认为的效果。它为每一行生成相同的值(运行时常量)。