use*_*740 6 postgresql pivot postgresql-9.1
我有如下数据:
created_at          | 状态
--------------------+------
2016-04-05 1:27:15  | 信息
2016-04-05 3:27:15  | 信息
2016-04-05 5:27:15  | 警告
2016-04-05 10:27:15 | 信息
2016-04-05 11:27:15 | 警告
有了这些数据,我想转换如下:
状态 | 2016-04-05 1:00:00 | 2016-04-05 4:00:00 | 2016-04-05 8:00:00 | 2016-04-05 12:00:00
-----+--------------------+--------------------+--------------------+---------------------
信息 | 1                  | 1                  | 0                  | 1
警告 | 0                  | 0                  | 1                  | 1
谁能建议最好的方法来做到这一点?
假设基础表中的第一个值是 2016-04-05 0:27:15 而不是 2016-04-05 1:27:15,这个问题对我来说才更说得通:
-- Sample table for the pivot examples: one row per logged event.
CREATE TABLE tbl (
   created_at timestamp
 , status     text
);

-- Five sample events on 2016-04-05, three 'info' and two 'warn'.
INSERT INTO tbl (created_at, status)
VALUES
   ('2016-04-05 00:27:15', 'info')
 , ('2016-04-05 03:27:15', 'info')
 , ('2016-04-05 05:27:15', 'warn')
 , ('2016-04-05 10:27:15', 'info')
 , ('2016-04-05 11:27:15', 'warn')
;
Run Code Online (Sandbox Code Playgroud)
逻辑是:统计发生在下一个边界之前(不含该边界)的事件。这与经常被忽视的函数 width_bucket() 完美契合。准确地说,它需要 Postgres 9.5 引入的、支持任意边界的变体(因为 OP 的边界没有规律可循)。以下解释直接摘自手册:
width_bucket(operand anyelement, thresholds anyarray)
给定一个列出各桶下限的数组 thresholds,返回 operand 将被分配到的桶号;对于小于第一个下限的输入返回 0;thresholds 数组必须按从小到大排序,否则会得到意外的结果。
对于常规存储桶,您也可以使用Postgres 9.1 中提供的另一种变体。
将它与 crosstab() 结合使用,并把同样的边界重新用作列名(查询的其余部分适用于 Postgres 9.1):
-- Pivot per-status event counts into one column per time boundary.
-- The first crosstab() argument returns (status, bucket number, count) rows;
-- the second returns the bucket numbers 0..3, one per value column declared
-- in the column definition list, in that order.
SELECT t.status
     , t."2016-04-05 01:00"
     , t."2016-04-05 04:00"
     , t."2016-04-05 08:00"
     , t."2016-04-05 12:00"
FROM   crosstab(
          $src$
          SELECT status
               , width_bucket(created_at
                            , ARRAY[timestamp '2016-04-05 01:00'
                                  , timestamp '2016-04-05 04:00'
                                  , timestamp '2016-04-05 08:00'
                                  , timestamp '2016-04-05 12:00']) AS slot
               , count(*)::int AS ct
          FROM   tbl
          WHERE  created_at < timestamp '2016-04-05 12:00'  -- leave out rows at or after the last boundary
          GROUP  BY status, slot
          ORDER  BY status, slot
          $src$
        , $cat$SELECT generate_series(0, 3)$cat$
       ) AS t(status text
            , "2016-04-05 01:00" int
            , "2016-04-05 04:00" int
            , "2016-04-05 08:00" int
            , "2016-04-05 12:00" int);
Run Code Online (Sandbox Code Playgroud)
结果:
status | 2016-04-05 01:00 | 2016-04-05 04:00 | 2016-04-05 08:00 | 2016-04-05 12:00
--------+------------------+------------------+------------------+------------------
info | 1 | 1 | | 1
warn | | | 1 | 1
Run Code Online (Sandbox Code Playgroud)
第二个交叉表参数 ( 'SELECT generate_series(0,3)'
) 是一个查询字符串,在执行时为每个目标列返回一行。在任一侧找不到的每个值 - 不在原始数据中或不是由第二个参数生成的 - 都会被忽略。
基础知识crosstab()
:
如果你希望结果中显示 0 而不是 NULL,可以用 COALESCE() 处理,但这只是一个表面问题:
-- Same pivot as a plain crosstab() over width_bucket() slots, but with
-- missing status/slot combinations reported as 0 instead of NULL.
-- The column definition list uses short internal names (slot0 .. slot3);
-- the outer SELECT maps them to the boundary labels.
SELECT pivoted.status
     , COALESCE(pivoted.slot0, 0) AS "2016-04-05 01:00"
     , COALESCE(pivoted.slot1, 0) AS "2016-04-05 04:00"
     , COALESCE(pivoted.slot2, 0) AS "2016-04-05 08:00"
     , COALESCE(pivoted.slot3, 0) AS "2016-04-05 12:00"
FROM   crosstab(
          $src$
          SELECT status
               , width_bucket(created_at
                            , ARRAY[timestamp '2016-04-05 01:00'
                                  , timestamp '2016-04-05 04:00'
                                  , timestamp '2016-04-05 08:00'
                                  , timestamp '2016-04-05 12:00']) AS slot
               , count(*)::int AS ct
          FROM   tbl
          WHERE  created_at < timestamp '2016-04-05 12:00'
          GROUP  BY status, slot
          ORDER  BY status, slot
          $src$
        , $cat$SELECT generate_series(0, 3)$cat$
       ) AS pivoted(status text, slot0 int, slot1 int, slot2 int, slot3 int);
Run Code Online (Sandbox Code Playgroud)
结果:
status | 2016-04-05 01:00 | 2016-04-05 04:00 | 2016-04-05 08:00 | 2016-04-05 12:00
--------+------------------+------------------+------------------+------------------
info | 1 | 1 | 0 | 1
warn | 0 | 0 | 1 | 1
Run Code Online (Sandbox Code Playgroud)
要为每个 status 添加总数,可以在 Postgres 9.5+ 中使用 GROUPING SETS:
-- Pivot with 0 instead of NULL plus a per-status total column.
-- GROUPING SETS adds one extra row per status whose slot is NULL; the source
-- query maps that NULL to the special slot -1, which the category list
-- places last so it lands in the "total" column.
SELECT pivoted.status
     , COALESCE(pivoted.slot0, 0)    AS "2016-04-05 01:00"
     , COALESCE(pivoted.slot1, 0)    AS "2016-04-05 04:00"
     , COALESCE(pivoted.slot2, 0)    AS "2016-04-05 08:00"
     , COALESCE(pivoted.slot3, 0)    AS "2016-04-05 12:00"
     , COALESCE(pivoted.total_ct, 0) AS total
FROM   crosstab(
          $src$
          SELECT status, COALESCE(slot, -1), ct  -- -1 marks the totals row
          FROM  (
             SELECT status
                  , width_bucket(created_at
                               , ARRAY[timestamp '2016-04-05 01:00'
                                     , timestamp '2016-04-05 04:00'
                                     , timestamp '2016-04-05 08:00'
                                     , timestamp '2016-04-05 12:00']) AS slot
                  , count(*)::int AS ct
             FROM   tbl
             WHERE  created_at < timestamp '2016-04-05 12:00'
             GROUP  BY GROUPING SETS ((1, 2), 1)  -- per (status, slot), then per status
             ORDER  BY 1, 2
             ) sub
          $src$
        , $cat$VALUES (0), (1), (2), (3), (-1)$cat$  -- category order = value column order
       ) AS pivoted(status text, slot0 int, slot1 int, slot2 int, slot3 int, total_ct int);
Run Code Online (Sandbox Code Playgroud)
结果如上,加上:
... | total
... -+-------
... | 3
... | 2
Run Code Online (Sandbox Code Playgroud)
请注意,total 包括聚合之前未被排除的所有行,即使这些行随后被 crosstab() 过滤掉。
这是对@Vérace 在评论中的请求的答复,而不是对不清楚的问题的答复。
为了回答这个问题,我做了以下事情。(我发现以下三个帖子很有帮助:1、2 和 3。我还发现 generate_series 和 CROSSTAB 的文档(分别见这里和这里)很有用。)这应该适用于 9.1:我没有测试过,但文档表明其中没有用到 9.1 之后才有的功能。
创建了一个表:
-- Sample table for the CASE-based pivot: one row per logged event.
-- (The psql prompt from the original transcript is dropped so the statement
-- can be run as-is.)
CREATE TABLE pv_tab (created_at timestamp, status varchar(10));
Run Code Online (Sandbox Code Playgroud)
并填充它。
-- Populate pv_tab with the sample events (11 'info' and 9 'warn').
-- A single multi-row INSERT with an explicit column list replaces the
-- original one-statement-per-row form; rows are kept in the original order.
INSERT INTO pv_tab (created_at, status)
VALUES
    ('2016-04-05 01:27:15', 'info'),
    ('2016-04-05 03:27:15', 'info'),
    ('2016-04-05 05:27:15', 'warn'),
    ('2016-04-05 10:27:15', 'info'),
    ('2016-04-05 11:27:15', 'warn'),
    ('2016-04-05 00:27:15', 'info'),
    ('2016-04-05 00:24:15', 'info'),
    ('2016-04-05 00:24:13', 'warn'),
    ('2016-04-05 00:24:13', 'warn'),
    ('2016-04-05 01:24:13', 'warn'),
    ('2016-04-05 01:24:13', 'info'),
    ('2016-04-05 01:12:13', 'info'),
    ('2016-04-05 01:12:22', 'info'),
    ('2016-04-05 02:05:45', 'info'),
    ('2016-04-05 02:34:45', 'warn'),
    ('2016-04-05 10:34:45', 'warn'),
    ('2016-04-05 10:35:45', 'warn'),
    ('2016-04-05 10:36:45', 'warn'),
    ('2016-04-05 10:36:45', 'info'),
    ('2016-04-05 10:36:34', 'info');
-- (20 rows)
Run Code Online (Sandbox Code Playgroud)
我的查询的(正确)结果是:
stat slot1 slot2 slot3 slot4 slot5 slot6 Total
---- ----- ----- ----- ----- ----- ----- -----
info 6 2 0 3 0 0 11
warn 3 1 1 4 0 0 9
Run Code Online (Sandbox Code Playgroud)
工作查询是:
-- Pivot status counts from pv_tab into one column per time slot plus a total.
--
-- The source query handed to crosstab() returns (status, time_slot, counts)
-- rows ordered by status and slot. The one-parameter form of crosstab() is
-- used, which fills the value columns left to right; the source query
-- therefore emits a row for every status/slot pair (zero-filled where there
-- is no data), followed by the per-status total as slot 7.
--
-- The source query is dollar-quoted so that the literals inside it need no
-- doubled single quotes.
SELECT
  My_Tab."stat",
  My_Tab."slot1",
  My_Tab."slot2",
  My_Tab."slot3",
  My_Tab."slot4",
  My_Tab."slot5",
  My_Tab."slot6",
  My_Tab."Total"
FROM CROSSTAB
(
  $source$
  WITH time_slots AS
  (
    -- Put each created_at value into a numbered bucket and count the rows
    -- per status and bucket. Only 4 of the 6 available slots are used, which
    -- illustrates how sparse data is handled in the result.
    -- The CASE has no ELSE: rows at or after 2016-04-05 12:00 (or with a
    -- NULL created_at) get a NULL time_slot, match no slot in the join
    -- below, and so appear only in the Total column.
    SELECT
      status,
      CASE
        WHEN created_at <  '2016-04-05 02:00' THEN 1
        WHEN created_at >= '2016-04-05 02:00' AND created_at < '2016-04-05 04:00' THEN 2
        WHEN created_at >= '2016-04-05 04:00' AND created_at < '2016-04-05 08:00' THEN 3
        WHEN created_at >= '2016-04-05 08:00' AND created_at < '2016-04-05 12:00' THEN 4
      END AS time_slot,
      COUNT(status) AS stat_count
    FROM pv_tab
    GROUP BY status, time_slot
  ),
  statuses AS
  (
    -- Every status value present in the data.
    SELECT DISTINCT status
    FROM pv_tab
  ),
  all_slots (time_slot) AS
  (
    -- Slot numbers 1..6, one per "slotN" output column.
    SELECT generate_series(1, 6)
  ),
  status_slots AS
  (
    -- Every status paired with every slot.
    SELECT statuses.status, all_slots.time_slot
    FROM statuses
    CROSS JOIN all_slots
  ),
  slot_counts AS
  (
    -- Attach the actual counts to the complete status/slot grid; pairs
    -- without data get 0. Without this, a status with fewer rows than
    -- output columns would have its values (and its total) shifted left.
    SELECT
      ss.status,
      ss.time_slot,
      COALESCE(ts.stat_count, 0) AS counts
    FROM status_slots ss
    LEFT JOIN time_slots ts
      ON ss.status = ts.status AND ss.time_slot = ts.time_slot
  ),
  status_totals AS
  (
    -- Per-status total over all rows of pv_tab, emitted as extra slot 7 so
    -- that it sorts after the six regular slots.
    SELECT status, 7 AS time_slot, COUNT(status) AS counts
    FROM pv_tab
    GROUP BY status
  )
  -- Slots 1..6 and slot 7 never overlap, so UNION ALL is sufficient.
  SELECT status, time_slot, counts FROM slot_counts
  UNION ALL
  SELECT status, time_slot, counts FROM status_totals
  ORDER BY status, time_slot
  $source$
) AS My_Tab("stat" varchar(10), "slot1" bigint, "slot2" bigint, "slot3" bigint, "slot4" bigint, "slot5" bigint, "slot6" bigint, "Total" bigint);
Run Code Online (Sandbox Code Playgroud)
归档时间: |
|
查看次数: |
10494 次 |
最近记录: |