jra*_*ara 5 sql sql-server database-design data-warehouse dimensional-modeling
我正在尝试实现一个Kimball风格的数据集市,该数据集市在维度表中使用-1和-2两行来处理迟到维度(late arriving dimension)和业务键为NULL的情况。下面是一段示例代码:它创建一个同时存放事实数据和维度数据的暂存表(staging table),并为数据集市创建两个维度表和一个事实表。以下是我的SQL示例代码和数据:
-- Staging table: one row per sale, together with the customer attributes and
-- the salesperson number that accompany it.
-- Uncomment the next two lines to rebuild the table from scratch.
--drop table stg_sales
--go
CREATE TABLE dbo.stg_sales (
    stg_sales_id       INT IDENTITY(1, 1) NOT NULL, -- generated row id of the staging row
    sales_number       INT                NOT NULL,
    sales_amt          INT                NULL,
    cust_number        INT                NULL,     -- customer business key
    cust_firstname     NVARCHAR(50)       NULL,
    cust_lastname      NVARCHAR(100)      NULL,
    cust_address       NVARCHAR(500)      NULL,
    salesperson_number INT                NULL,     -- salesperson business key
    CONSTRAINT pk_stg_sales PRIMARY KEY (stg_sales_id)
)
GO
-- Sample staging data: five sales. The table is schema-qualified (dbo.) to match
-- the CREATE TABLE and every other statement that touches it.
INSERT INTO dbo.stg_sales
    (sales_number, sales_amt, cust_number, cust_firstname, cust_lastname, cust_address, salesperson_number)
VALUES
    (123, 434, 2342, 'Jim', 'Moriaty', 'something', 23);

INSERT INTO dbo.stg_sales
    (sales_number, sales_amt, cust_number, cust_firstname, cust_lastname, cust_address, salesperson_number)
VALUES
    (124, 234, 2342, 'Jim', 'Moriaty', 'something', 23);

INSERT INTO dbo.stg_sales
    (sales_number, sales_amt, cust_number, cust_firstname, cust_lastname, cust_address, salesperson_number)
VALUES
    (125, 434, 4545, 'Joe', 'Esk', 'someother', 24);

INSERT INTO dbo.stg_sales
    (sales_number, sales_amt, cust_number, cust_firstname, cust_lastname, cust_address, salesperson_number)
VALUES
    (126, 434, 5555, 'Daniel', 'Hart', 'Someaddr', NULL); --salesperson_number business key missing here

INSERT INTO dbo.stg_sales
    (sales_number, sales_amt, cust_number, cust_firstname, cust_lastname, cust_address, salesperson_number)
VALUES
    (127, 333, 4444, 'Pat', 'Smith', 'Someaddr', 30);

-- Inspect the staged rows; explicit columns and ORDER BY keep the output stable.
SELECT stg_sales_id,
       sales_number,
       sales_amt,
       cust_number,
       cust_firstname,
       cust_lastname,
       cust_address,
       salesperson_number
FROM dbo.stg_sales
ORDER BY stg_sales_id;
--create a dimension and fact tables
--drop table dbo.dim_customer
--go
-- Customer dimension. customer_wid is the surrogate key; the negative values
-- -1 and -2 are inserted explicitly further down as special members.
CREATE TABLE dbo.dim_customer
(
    customer_wid   INT IDENTITY(1, 1) NOT NULL,
    cust_number    INT NULL,                             -- customer business key
    cust_firstname NVARCHAR(50) NULL,
    cust_lastname  NVARCHAR(100) NULL,
    cust_address   NVARCHAR(500) NULL,
    date_insert    DATETIME2 NOT NULL DEFAULT (Getdate()),
    date_update    DATETIME2 NULL,
    is_current     BIT NOT NULL,
    -- Table-level constraints, each separated by a comma. Without the commas
    -- they are parsed as column constraints attached to is_current.
    CONSTRAINT pk_dim_customer PRIMARY KEY (customer_wid),
    CONSTRAINT chk_is_current CHECK (is_current IN (0, 1))
)
go
-- Seed the two special members of dim_customer. IDENTITY_INSERT is switched on
-- so that the negative surrogate keys can be supplied explicitly.
SET IDENTITY_INSERT dbo.dim_customer ON;

-- customer_wid = -1 : the 'unknown' member
INSERT INTO dbo.dim_customer
    (customer_wid, cust_number, cust_firstname, cust_lastname, cust_address, date_insert, date_update, is_current)
VALUES
    (-1, -1, 'unknown', 'unknown', 'unknown', Getdate(), Getdate(), 1);

-- customer_wid = -2 : the 'Error' member
INSERT INTO dbo.dim_customer
    (customer_wid, cust_number, cust_firstname, cust_lastname, cust_address, date_insert, date_update, is_current)
VALUES
    (-2, -2, 'Error', 'Error', 'Error', Getdate(), Getdate(), 1);

SET IDENTITY_INSERT dbo.dim_customer OFF;
--insert data into dimension table
-- One dimension row per distinct customer found in staging, flagged as current.
INSERT INTO dbo.dim_customer
    (cust_number, cust_firstname, cust_lastname, cust_address, is_current)
SELECT src.cust_number,
       src.cust_firstname,
       src.cust_lastname,
       src.cust_address,
       1 AS is_current
FROM (
    SELECT DISTINCT cust_number,
                    cust_firstname,
                    cust_lastname,
                    cust_address
    FROM dbo.stg_sales
    -- left one record off to simulate the situation where you don't have a
    -- corresponding row in the dim table (late arriving dimension)
    WHERE cust_number <> 4444
) AS src;

-- Inspect the dimension after the load.
SELECT customer_wid,
       cust_number,
       cust_firstname,
       cust_lastname,
       cust_address,
       date_insert,
       date_update,
       is_current
FROM dbo.dim_customer;
--create salesperson table
-- Drop a previous copy only if one exists, so the script also runs cleanly the
-- first time (an unconditional DROP TABLE raises an error when the table is missing).
IF Object_id(N'dbo.dim_salesperson', N'U') IS NOT NULL
    DROP TABLE dbo.dim_salesperson

-- Salesperson dimension. salesperson_wid is the surrogate key; the negative
-- values -1 and -2 are inserted explicitly further down as special members.
CREATE TABLE dbo.dim_salesperson
(
    salesperson_wid       INT IDENTITY(1, 1) NOT NULL,
    salesperson_number    INT NULL,                      -- salesperson business key
    salesperson_firstname NVARCHAR(50) NULL,
    salesperson_lastname  NVARCHAR(100) NULL,
    salesperson_address   NVARCHAR(500) NULL,
    date_insert           DATETIME2 NOT NULL DEFAULT (Getdate()),
    date_update           DATETIME2 NULL,
    is_current            BIT NOT NULL,
    -- Table-level constraints, each separated by a comma. Without the commas
    -- they are parsed as column constraints attached to is_current.
    CONSTRAINT pk_dim_salesperson PRIMARY KEY (salesperson_wid),
    CONSTRAINT chk_dim_salesperson_is_current CHECK (is_current IN (0, 1))
)
go
-- Seed the two special members of dim_salesperson. IDENTITY_INSERT is switched
-- on so that the negative surrogate keys can be supplied explicitly.
SET IDENTITY_INSERT dbo.dim_salesperson ON;

-- salesperson_wid = -1 : the 'Not available' member
INSERT INTO dbo.dim_salesperson
    (salesperson_wid, salesperson_number, salesperson_firstname, salesperson_lastname, salesperson_address,
     date_insert, date_update, is_current)
VALUES
    (-1, -1, 'Not available', 'Not available', 'Not available', Getdate(), Getdate(), 1);

-- salesperson_wid = -2 : the 'Error' member
INSERT INTO dbo.dim_salesperson
    (salesperson_wid, salesperson_number, salesperson_firstname, salesperson_lastname, salesperson_address,
     date_insert, date_update, is_current)
VALUES
    (-2, -2, 'Error', 'Error', 'Error', Getdate(), Getdate(), 1);

SET IDENTITY_INSERT dbo.dim_salesperson OFF;
--insert data into salesperson
-- Three regular salespeople (business keys 23, 24 and 30), all flagged as current.
-- Kept as separate statements so the identity values are handed out in this order.
INSERT INTO dbo.dim_salesperson
    (salesperson_number, salesperson_firstname, salesperson_lastname, salesperson_address, is_current)
VALUES
    (23, 'John', 'Fox', 'something', 1);

INSERT INTO dbo.dim_salesperson
    (salesperson_number, salesperson_firstname, salesperson_lastname, salesperson_address, is_current)
VALUES
    (24, 'Hadley', 'Fox', 'something', 1);

INSERT INTO dbo.dim_salesperson
    (salesperson_number, salesperson_firstname, salesperson_lastname, salesperson_address, is_current)
VALUES
    (30, 'Ashley', 'Fox', 'something', 1);

-- Inspect the dimension and the staging rows side by side before building the fact table.
SELECT salesperson_wid,
       salesperson_number,
       salesperson_firstname,
       salesperson_lastname,
       salesperson_address,
       date_insert,
       date_update,
       is_current
FROM dbo.dim_salesperson;

SELECT stg_sales_id,
       sales_number,
       sales_amt,
       cust_number,
       cust_firstname,
       cust_lastname,
       cust_address,
       salesperson_number
FROM dbo.stg_sales;
--create and populate the fact table
--drop table dbo.f_sales
--go
-- Sales fact table: one row per sales_number. Both dimension keys are NOT NULL
-- and enforced by foreign keys, so every fact row must point at a dimension row.
CREATE TABLE dbo.f_sales
(
    sales_number    INT NOT NULL,
    customer_wid    INT NOT NULL,
    salesperson_wid INT NOT NULL,
    sales_amt       INT NULL,
    -- Table-level constraints, each separated by a comma.
    CONSTRAINT pk_f_sales PRIMARY KEY (sales_number),
    CONSTRAINT fk_customer_wid FOREIGN KEY (customer_wid) REFERENCES
        dbo.dim_customer(customer_wid),
    CONSTRAINT fk_salesperson_wid FOREIGN KEY (salesperson_wid) REFERENCES
        dbo.dim_salesperson(salesperson_wid)
)
--populate the fact table
-- Resolve every staged sale to its dimension surrogate keys. The two failure
-- cases are told apart by looking at the staging business key, not only at the
-- result of the lookup:
--   -2 : the business key is NULL in the source, so there is nothing to look up
--   -1 : the business key is present but has no row in the dimension yet
--        (late arriving dimension)
-- A plain ISNULL(dim.wid, <constant>) cannot distinguish these, because the
-- LEFT JOIN returns NULL in both situations.
-- NOTE(review): the joins match on the business key only; if a dimension ever
-- holds several versions of one business key, the join will need a further
-- condition (for example on is_current) to avoid duplicate fact rows.
INSERT INTO dbo.f_sales
    (sales_number,
     customer_wid,
     salesperson_wid,
     sales_amt)
SELECT stg.sales_number,
       CASE
           WHEN stg.cust_number IS NULL THEN -2
           WHEN dimcust.customer_wid IS NULL THEN -1
           ELSE dimcust.customer_wid
       END AS customer_wid,
       CASE
           WHEN stg.salesperson_number IS NULL THEN -2
           WHEN dimsp.salesperson_wid IS NULL THEN -1
           ELSE dimsp.salesperson_wid
       END AS salesperson_wid,
       stg.sales_amt
FROM dbo.stg_sales AS stg
LEFT JOIN dbo.dim_customer AS dimcust
    ON stg.cust_number = dimcust.cust_number
LEFT JOIN dbo.dim_salesperson AS dimsp
    ON stg.salesperson_number = dimsp.salesperson_number;

-- Inspect the loaded facts.
SELECT sales_number,
       customer_wid,
       salesperson_wid,
       sales_amt
FROM dbo.f_sales
ORDER BY sales_number;
Run Code Online (Sandbox Code Playgroud)
如何为源系统中业务键缺失(为NULL)的行分配-2?关于这种实现背后的理论,您可以阅读Kimball的相关资料:
这基本上是我要实现的目标:
编辑:
我想我可以在左联接中使用COALESCE或ISNULL,这似乎会产生正确的结果:
-- Alternative fact load: a NULL business key in staging is mapped to -2 inside
-- the join condition, so it matches the dimension row whose business key is -2.
-- A business key that is present but not found in the dimension (late arriving
-- dimension) still comes back as NULL from the LEFT JOIN, so BOTH lookups need
-- the -1 fallback; otherwise the NOT NULL salesperson_wid column rejects the row.
INSERT INTO dbo.f_sales
    (sales_number,
     customer_wid,
     salesperson_wid,
     sales_amt)
SELECT stg.sales_number,
       COALESCE(dimcust.customer_wid, -1) AS customer_wid,
       COALESCE(dimsp.salesperson_wid, -1) AS salesperson_wid,
       stg.sales_amt
FROM dbo.stg_sales AS stg
LEFT JOIN dbo.dim_customer AS dimcust
    ON COALESCE(stg.cust_number, -2) = dimcust.cust_number
LEFT JOIN dbo.dim_salesperson AS dimsp
    ON COALESCE(stg.salesperson_number, -2) = dimsp.salesperson_number
Run Code Online (Sandbox Code Playgroud)
如果纯粹把它当作一种查找(lookup)技术,可以这样做:
-- add nullable keys to the staging table
alter table dbo.stg_sales ADD
      sales_person_wid integer null
    , customer_wid     integer null
;
go
-- The batch separator above matters: statements that reference the new columns
-- must be compiled in a later batch than the ALTER TABLE that adds them.

-- insert to staging table here (as in your example)

-- lookup sales person key
-- (the dimension column is salesperson_wid; the staging column is sales_person_wid)
update s
set    sales_person_wid = p.salesperson_wid
from   dbo.stg_sales as s
inner join dbo.dim_salesperson as p
        on p.salesperson_number = s.salesperson_number ;

-- decide what to do with missing business keys:
-- a NULL business key in the source gets the -2 member
update dbo.stg_sales
set    sales_person_wid = -2
where  sales_person_wid is null
  and  salesperson_number is null ;

-- a business key that is present but was not found in the dimension
-- (late arriving dimension) gets the -1 member
update dbo.stg_sales
set    sales_person_wid = -1
where  sales_person_wid is null ;

-- do similar for customer
-- now all keys in staging table are not null
-- load to fact table
Run Code Online (Sandbox Code Playgroud)
然而,更常见的做法是在抽取或清洗阶段就为交易记录分配特殊的业务键(未知、不适用、错误)。换句话说,可以在记录进入暂存表之前,就为其分配表示Error的特殊业务键。
最后要注意的是,迟到维度(late arriving dimension)指的是:业务键(salesperson_number)在业务系统(operational system)中是已知的,但交易(销售事实)先于维度数据到达数据仓库。因此,salesperson_number不会是NULL,只是暂时还不存在于维度表中。您必须把这笔交易先保存在某处,等对应记录到达维度表之后(通常是大约一天后)再尝试更新外键(salesperson_wid)。