如何在Kimball数据集市(Data Mart)中处理迟到维度和NULL业务键?

jra*_*ara 5 sql sql-server database-design data-warehouse dimensional-modeling

我正在尝试实现一个Kimball数据集市,该数据集市在维度表中使用-1和-2两行来处理迟到维度和NULL业务键。下面是我的示例代码:它为事实和维度数据创建一个暂存表,并为数据集市创建两个维度表和一个事实表。以下是带有示例数据的SQL代码:

--drop table stg_sales
--go
-- Staging table for the sales feed: one row per sale, carrying the customer
-- attributes and the salesperson business key alongside the sale itself.
CREATE TABLE dbo.stg_sales
(
    -- row id generated by the staging table itself
    stg_sales_id       INT IDENTITY(1, 1) NOT NULL
        CONSTRAINT pk_stg_sales PRIMARY KEY,
    -- sale attributes
    sales_number       INT           NOT NULL,
    sales_amt          INT           NULL,
    -- customer business key and attributes
    cust_number        INT           NULL,
    cust_firstname     NVARCHAR(50)  NULL,
    cust_lastname      NVARCHAR(100) NULL,
    cust_address       NVARCHAR(500) NULL,
    -- salesperson business key; nullable, and the sample data leaves it NULL for one sale
    salesperson_number INT           NULL
)

go

-- Sample staging rows.
-- The table is schema-qualified (dbo.stg_sales) to match its CREATE TABLE, so the
-- inserts do not depend on the caller's default schema.
-- Two rows matter for the rest of the script:
--   * sale 126 has a NULL salesperson_number (business key missing in the source)
--   * sale 127 uses cust_number 4444, which the dim_customer load deliberately skips
INSERT INTO dbo.stg_sales
    (sales_number, sales_amt, cust_number, cust_firstname, cust_lastname, cust_address, salesperson_number)
VALUES
    (123, 434, 2342, 'Jim', 'Moriaty', 'something', 23);

INSERT INTO dbo.stg_sales
    (sales_number, sales_amt, cust_number, cust_firstname, cust_lastname, cust_address, salesperson_number)
VALUES
    (124, 234, 2342, 'Jim', 'Moriaty', 'something', 23);

INSERT INTO dbo.stg_sales
    (sales_number, sales_amt, cust_number, cust_firstname, cust_lastname, cust_address, salesperson_number)
VALUES
    (125, 434, 4545, 'Joe', 'Esk', 'someother', 24);

INSERT INTO dbo.stg_sales
    (sales_number, sales_amt, cust_number, cust_firstname, cust_lastname, cust_address, salesperson_number)
VALUES
    (126, 434, 5555, 'Daniel', 'Hart', 'Someaddr', NULL); --salesperson_number business key missing here

INSERT INTO dbo.stg_sales
    (sales_number, sales_amt, cust_number, cust_firstname, cust_lastname, cust_address, salesperson_number)
VALUES
    (127, 333, 4444, 'Pat', 'Smith', 'Someaddr', 30);

-- Inspect what was staged; explicit columns and a stable order make the output repeatable.
SELECT stg_sales_id,
       sales_number,
       sales_amt,
       cust_number,
       cust_firstname,
       cust_lastname,
       cust_address,
       salesperson_number
FROM   dbo.stg_sales
ORDER  BY stg_sales_id;

--create a dimension and fact tables
--drop table dbo.dim_customer
--go
-- Customer dimension.
--   customer_wid : surrogate key (identity); the script later seeds -1 and -2 as placeholder rows
--   cust_number  : business key coming from staging
--   is_current   : flag restricted to 0/1 by chk_is_current
-- The table-level constraints are now comma-separated from the column list, as the
-- documented CREATE TABLE grammar requires, instead of relying on parser leniency.
CREATE TABLE dbo.dim_customer
  (
     customer_wid   INT IDENTITY(1, 1) NOT NULL,
     cust_number    INT NULL,
     cust_firstname NVARCHAR(50) NULL,
     cust_lastname  NVARCHAR(100) NULL,
     cust_address   NVARCHAR(500) NULL,
     date_insert    DATETIME2 NOT NULL DEFAULT (Getdate()),
     date_update    DATETIME2 NULL,
     is_current     BIT NOT NULL,
     CONSTRAINT pk_dim_customer PRIMARY KEY (customer_wid),
     CONSTRAINT chk_is_current CHECK (is_current IN (0, 1))
  )

go

-- Seed the two placeholder members with fixed negative surrogate keys.
-- customer_wid is an IDENTITY column, so IDENTITY_INSERT has to be ON while
-- explicit key values are supplied.
SET IDENTITY_INSERT dbo.dim_customer ON;

-- placeholder row -1: 'unknown'
INSERT INTO dbo.dim_customer
    (customer_wid, cust_number, cust_firstname, cust_lastname, cust_address, date_insert, date_update, is_current)
VALUES
    (-1, -1, 'unknown', 'unknown', 'unknown', Getdate(), Getdate(), 1);

-- placeholder row -2: 'Error'
INSERT INTO dbo.dim_customer
    (customer_wid, cust_number, cust_firstname, cust_lastname, cust_address, date_insert, date_update, is_current)
VALUES
    (-2, -2, 'Error', 'Error', 'Error', Getdate(), Getdate(), 1);

SET IDENTITY_INSERT dbo.dim_customer OFF;

-- Load the distinct customers found in staging into the dimension.
-- cust_number 4444 is left out on purpose to simulate a late arriving dimension:
-- a sale exists for it but there is no corresponding row in the dim table.
-- (The <> comparison also filters out any row whose cust_number is NULL.)
INSERT INTO dbo.dim_customer
    (cust_number, cust_firstname, cust_lastname, cust_address, is_current)
SELECT   stg.cust_number,
         stg.cust_firstname,
         stg.cust_lastname,
         stg.cust_address,
         1 AS is_current
FROM     dbo.stg_sales AS stg
WHERE    stg.cust_number <> 4444
GROUP BY stg.cust_number,
         stg.cust_firstname,
         stg.cust_lastname,
         stg.cust_address;

-- Show the dimension after the load.
SELECT customer_wid,
       cust_number,
       cust_firstname,
       cust_lastname,
       cust_address,
       date_insert,
       date_update,
       is_current
FROM   dbo.dim_customer;

-- Drop a previous copy only if one exists, so the first run does not raise an
-- error on a missing table.
-- NOTE: the drop still fails while dbo.f_sales exists, because fk_salesperson_wid
-- references this table; drop dbo.f_sales first when re-running the whole script.
IF Object_id(N'dbo.dim_salesperson', N'U') IS NOT NULL
  DROP TABLE dbo.dim_salesperson

--create salesperson table
--   salesperson_wid    : surrogate key (identity); -1 and -2 are seeded later as placeholder rows
--   salesperson_number : business key coming from staging
-- The table-level constraints are comma-separated from the column list, as the
-- documented CREATE TABLE grammar requires.
CREATE TABLE dbo.dim_salesperson
  (
     salesperson_wid       INT IDENTITY(1, 1) NOT NULL,
     salesperson_number    INT NULL,
     salesperson_firstname NVARCHAR(50) NULL,
     salesperson_lastname  NVARCHAR(100) NULL,
     salesperson_address   NVARCHAR(500) NULL,
     date_insert           DATETIME2 NOT NULL DEFAULT (Getdate()),
     date_update           DATETIME2 NULL,
     is_current            BIT NOT NULL,
     CONSTRAINT pk_dim_salesperson PRIMARY KEY (salesperson_wid),
     CONSTRAINT chk_dim_salesperson_is_current CHECK (is_current IN (0, 1))
  )

go

-- Seed the two placeholder members with fixed negative surrogate keys.
-- salesperson_wid is an IDENTITY column, so IDENTITY_INSERT has to be ON while
-- explicit key values are supplied.
SET IDENTITY_INSERT dbo.dim_salesperson ON;

-- placeholder row -1: 'Not available'
INSERT INTO dbo.dim_salesperson
    (salesperson_wid, salesperson_number, salesperson_firstname, salesperson_lastname,
     salesperson_address, date_insert, date_update, is_current)
VALUES
    (-1, -1, 'Not available', 'Not available', 'Not available', Getdate(), Getdate(), 1);

-- placeholder row -2: 'Error'
INSERT INTO dbo.dim_salesperson
    (salesperson_wid, salesperson_number, salesperson_firstname, salesperson_lastname,
     salesperson_address, date_insert, date_update, is_current)
VALUES
    (-2, -2, 'Error', 'Error', 'Error', Getdate(), Getdate(), 1);

SET IDENTITY_INSERT dbo.dim_salesperson OFF;

-- Sample salespeople; salesperson_wid is generated by the identity column.
-- Business keys 23, 24 and 30 are the non-NULL salesperson_number values used in staging.
INSERT INTO dbo.dim_salesperson
    (salesperson_number, salesperson_firstname, salesperson_lastname, salesperson_address, is_current)
VALUES
    (23, 'John', 'Fox', 'something', 1);

INSERT INTO dbo.dim_salesperson
    (salesperson_number, salesperson_firstname, salesperson_lastname, salesperson_address, is_current)
VALUES
    (24, 'Hadley', 'Fox', 'something', 1);

INSERT INTO dbo.dim_salesperson
    (salesperson_number, salesperson_firstname, salesperson_lastname, salesperson_address, is_current)
VALUES
    (30, 'Ashley', 'Fox', 'something', 1);

-- Show the dimension, then the staging rows, before the fact load.
SELECT salesperson_wid,
       salesperson_number,
       salesperson_firstname,
       salesperson_lastname,
       salesperson_address,
       date_insert,
       date_update,
       is_current
FROM   dbo.dim_salesperson;

SELECT stg_sales_id,
       sales_number,
       sales_amt,
       cust_number,
       cust_firstname,
       cust_lastname,
       cust_address,
       salesperson_number
FROM   dbo.stg_sales;

--create and populate the fact table
--drop table dbo.f_sales
--go
-- Sales fact table: one row per sales_number.
-- Both dimension keys are NOT NULL and foreign-keyed, so every fact row must point
-- at some dimension row; the -1 / -2 placeholder rows exist for the cases where no
-- real dimension row can be resolved.
-- The table-level constraints are comma-separated from the column list, as the
-- documented CREATE TABLE grammar requires.
CREATE TABLE dbo.f_sales
  (
     sales_number    INT NOT NULL,
     customer_wid    INT NOT NULL,
     salesperson_wid INT NOT NULL,
     sales_amt       INT NULL,
     CONSTRAINT pk_f_sales PRIMARY KEY (sales_number),
     CONSTRAINT fk_customer_wid FOREIGN KEY (customer_wid) REFERENCES
     dbo.dim_customer(customer_wid),
     CONSTRAINT fk_salesperson_wid FOREIGN KEY (salesperson_wid) REFERENCES
     dbo.dim_salesperson(salesperson_wid)
  )

--populate the fact table
-- Surrogate key resolution, applied the same way to both dimensions:
--   * business key is NULL in staging          -> -2 (the 'Error' placeholder row)
--   * business key present, no dimension match -> -1 (late arriving dimension placeholder)
--   * otherwise                                -> the matched dimension surrogate key
-- The NULL test has to look at the STAGING business key. After a LEFT JOIN the
-- dimension key is NULL in both failure cases, so Isnull() on the dimension key
-- alone cannot tell them apart.
INSERT INTO dbo.f_sales
       (sales_number,
        customer_wid,
        salesperson_wid,
        sales_amt)
SELECT stg.sales_number,
       CASE
         WHEN stg.cust_number IS NULL THEN -2
         ELSE Isnull(dimcust.customer_wid, -1)
       END AS customer_wid,
       CASE
         WHEN stg.salesperson_number IS NULL THEN -2
         ELSE Isnull(dimsp.salesperson_wid, -1)
       END AS salesperson_wid,
       stg.sales_amt
FROM   dbo.stg_sales AS stg
       LEFT JOIN dbo.dim_customer AS dimcust
              ON stg.cust_number = dimcust.cust_number
       LEFT JOIN dbo.dim_salesperson AS dimsp
              ON stg.salesperson_number = dimsp.salesperson_number

-- Show the loaded facts in a stable order.
SELECT sales_number,
       customer_wid,
       salesperson_wid,
       sales_amt
FROM   dbo.f_sales
ORDER  BY sales_number
Run Code Online (Sandbox Code Playgroud)

如何为源系统中缺少业务键的行分配-2?您可以从Kimball的文章中阅读有关此实现背后理论的更多信息:

这基本上是我要实现的目标:

处理维度和度量中的NULL

编辑:

我想我可以在左联接中使用COALESCE或ISNULL,这似乎会产生正确的结果:

-- Alternative fact load: map a NULL staging business key to -2 inside the join, so
-- it matches the dimension's own -2 placeholder row (whose business key is also -2).
-- A business key that is present but has no dimension row yet (late arriving
-- dimension) still produces no match, so BOTH key columns need the -1 fallback;
-- without it salesperson_wid would be NULL and violate the NOT NULL column on f_sales.
-- NOTE: this is an alternative to the earlier load into dbo.f_sales, not an addition.
-- Running both against the same staging rows violates pk_f_sales.
INSERT INTO dbo.f_sales
       (sales_number,
        customer_wid,
        salesperson_wid,
        sales_amt)
SELECT stg.sales_number,
       Isnull(dimcust.customer_wid, -1)  AS customer_wid,
       Isnull(dimsp.salesperson_wid, -1) AS salesperson_wid,
       stg.sales_amt
FROM   dbo.stg_sales AS stg
       LEFT JOIN dbo.dim_customer AS dimcust
              ON COALESCE(stg.cust_number, -2) = dimcust.cust_number
       LEFT JOIN dbo.dim_salesperson AS dimsp
              ON COALESCE(stg.salesperson_number, -2) = dimsp.salesperson_number
Run Code Online (Sandbox Code Playgroud)

Dam*_*vic 4

纯粹作为一种查找技术

-- add nullable keys to the staging table
alter table dbo.stg_sales add
  sales_person_wid integer null
, customer_wid     integer null
;
go
-- The batch separator above is required: statements that reference the new
-- columns cannot be compiled in the same batch as the ALTER TABLE that adds them.

-- insert to staging table here (as in your example)

-- lookup sales person key
-- (the dimension column is salesperson_wid; only the staging column is named sales_person_wid)
update s
   set sales_person_wid = p.salesperson_wid
from dbo.stg_sales as s
inner join dbo.dim_salesperson as p
        on p.salesperson_number = s.salesperson_number ;

-- business key missing in the source -> -2 ('Error' placeholder row)
update dbo.stg_sales
   set sales_person_wid = -2
where sales_person_wid is null
  and salesperson_number is null ;

-- business key present but no dimension row yet (late arriving) -> -1 placeholder row
update dbo.stg_sales
   set sales_person_wid = -1
where sales_person_wid is null ;

-- same three steps for customer
update s
   set customer_wid = c.customer_wid
from dbo.stg_sales as s
inner join dbo.dim_customer as c
        on c.cust_number = s.cust_number ;

update dbo.stg_sales
   set customer_wid = -2
where customer_wid is null
  and cust_number is null ;

update dbo.stg_sales
   set customer_wid = -1
where customer_wid is null ;

-- now all keys in staging table are not null

-- load to fact table
Run Code Online (Sandbox Code Playgroud)

然而,通常的做法是在提取或清理阶段为事务分配特殊的(未知、不适用、错误)业务键。换句话说,可以在记录进入暂存表之前就为其分配特殊的Error业务键。

最后要注意的是,迟到维度意味着业务键(salesperson_number)在业务(操作型)系统中是已知的,但交易(销售事实)先于维度数据到达仓库。因此,salesperson_number将是not null,但在维度表中还不存在。您必须将此交易保存在某处,并在维度记录到达之后(大约一天后)尝试更新FK(salesperson_wid)。