goo*_*ose 5 sql google-bigquery
我正在开发一个非常复杂的查询,需要多次堆叠数据(即使用 UNION ALL)。令我惊讶的是,BigQuery 不喜欢这种堆叠,试运行(dry run)时报出了如下异常:
查询执行期间资源超出:没有足够的资源用于查询计划 - 子查询过多或查询过于复杂。
我已经定位到查询中出现问题的位置,并确认问题似乎是由过多的 UNION ALL 导致的。我很惊讶 UNION ALL 会造成这种情况,但我怀疑是我在这方面的想法太天真了。
为什么 BigQuery 不能处理这个额外的UNION ALL?堆叠数据不是更直接的操作之一吗?
我有哪些选择可以达到相同的结果?是否有我不知道的操作可以完成相同的工作或替代方法?
这是完整的查询。不过需要说明的是,`project.dataset.source_view` 这个视图本身会先做一些相对简单的处理:
-- Funnel report: turns wide per-step session counts into one row per
-- (date, dimension, value, step), while also carrying all six step totals
-- on every row. This is the UNION ALL based formulation.
--
-- p0_funnel: the raw columns read from the source view.
WITH p0_funnel AS (
SELECT
date,
platform_type,
platform,
flow,
step_1,
step_2,
step_3,
step_4,
step_5,
step_6
FROM `project.dataset.source_view`
-- p1_funnel: sums each step column per (date, flow, platform_type, platform).
), p1_funnel AS (
SELECT
date,
flow,
platform_type,
platform,
SUM(step_1) AS step_1,
SUM(step_2) AS step_2,
SUM(step_3) AS step_3,
SUM(step_4) AS step_4,
SUM(step_5) AS step_5,
SUM(step_6) AS step_6
FROM p0_funnel
GROUP BY
date,
flow,
platform_type,
platform
-- p2_funnel: pass-through of p1_funnel (same columns, different order).
-- It is the "wide" side of the join in p4_funnel.
), p2_funnel AS (
SELECT
date,
flow,
platform,
platform_type,
step_1,
step_2,
step_3,
step_4,
step_5,
step_6
FROM p1_funnel
-- p3_funnel: unpivots p1_funnel into long form. Each of the six UNION ALL
-- branches emits one step label and that step's total as step_sessions.
), p3_funnel AS (
SELECT
date, platform, platform_type, flow,
'step_1' AS step,
step_1 AS step_sessions
FROM p1_funnel
UNION ALL
SELECT
date, platform, platform_type, flow,
'step_2' AS step,
step_2 AS step_sessions
FROM p1_funnel
UNION ALL
SELECT
date, platform, platform_type, flow,
'step_3' AS step,
step_3 AS step_sessions
FROM p1_funnel
UNION ALL
SELECT
date, platform, platform_type, flow,
'step_4' AS step,
step_4 AS step_sessions
FROM p1_funnel
UNION ALL
SELECT
date, platform, platform_type, flow,
'step_5' AS step,
step_5 AS step_sessions
FROM p1_funnel
UNION ALL
SELECT
date, platform, platform_type, flow,
'step_6' AS step,
step_6 AS step_sessions
FROM p1_funnel
-- p4_funnel: joins the long rows back to the wide rows on the four grouping
-- keys, so every (key, step) row has step_sessions plus all six step totals.
), p4_funnel AS (
SELECT
main.date,
platform, platform_type, flow,
step,
step_1,
step_2,
step_3,
step_4,
step_5,
step_6,
step_sessions
FROM p3_funnel AS main
JOIN p2_funnel USING(date, platform, platform_type, flow)
-- The next three CTEs each expose one grouping column of p4_funnel as a
-- generic (dimension, value) pair; all other columns are passed through.
), funnel_platform_type AS (
SELECT
date,
'platform_type' AS dimension,
platform_type AS value,
step,
step_1,
step_2,
step_3,
step_4,
step_5,
step_6,
step_sessions
FROM p4_funnel
), funnel_platform AS (
SELECT
date,
'platform' AS dimension,
platform AS value,
step,
step_1,
step_2,
step_3,
step_4,
step_5,
step_6,
step_sessions
FROM p4_funnel
), funnel_flow AS (
SELECT
date,
'flow' AS dimension,
flow AS value,
step,
step_1,
step_2,
step_3,
step_4,
step_5,
step_6,
step_sessions
FROM p4_funnel
-- p5_funnel: stacks the three per-dimension result sets into one.
), p5_funnel AS (
SELECT * FROM funnel_platform_type UNION ALL
SELECT * FROM funnel_platform UNION ALL
SELECT * FROM funnel_flow # including this UNION ALL first introduces the problem
)
-- Final projection.
--   dim_order:          row number within each (dimension, step), highest
--                       step_1 first. The partition does not include date or
--                       value, and rows with equal step_1 have no tie-breaker.
--   step_order:         numeric position (1-6) of the step label.
--   next_step_sessions: total of the following step; NULL for step_6.
SELECT
date,
dimension,
ROW_NUMBER() OVER (PARTITION BY dimension, step ORDER BY step_1 DESC) AS dim_order,
value,
step,
CASE
WHEN step = 'step_1' THEN 1
WHEN step = 'step_2' THEN 2
WHEN step = 'step_3' THEN 3
WHEN step = 'step_4' THEN 4
WHEN step = 'step_5' THEN 5
WHEN step = 'step_6' THEN 6
ELSE null
END AS step_order,
CASE
WHEN step = 'step_1' THEN step_2
WHEN step = 'step_2' THEN step_3
WHEN step = 'step_3' THEN step_4
WHEN step = 'step_4' THEN step_5
WHEN step = 'step_5' THEN step_6
WHEN step = 'step_6' THEN null
ELSE null
END AS next_step_sessions,
step_1,
step_2,
step_3,
step_4,
step_5,
step_6,
step_sessions
FROM p5_funnel
Run Code Online (Sandbox Code Playgroud)
我想我已经找到了(2)的答案。
我对(1)仍然不清楚,但我仍然感兴趣。我只能推测这是由于 BigQuery 的内部运作造成的。
我对 (2) 的解决方案采用了不同的查询策略。从某种意义上说,它先用 CROSS JOIN 构建出所需维度的网格,然后只需 JOIN 回数据集并去掉不需要的内容,从而把繁重的计算与数据变形(“体操”)分开。
看起来是这样的:
-- Funnel report built with a CROSS JOIN grid instead of stacked UNION ALLs.
-- Output: one row per (dimension, step) for every
-- (flow, platform_type, platform) combination, with that step's total, the
-- following step's total, and ordering helpers.
WITH p0_funnel AS (
    -- Raw per-step counts. Only columns that are used downstream are selected.
    -- NOTE(review): the source column is spelled `step_6s` and the table is
    -- `source` (not `source_view`) — confirm both against the real schema.
    SELECT
        platform_type,
        platform,
        flow,
        step_1,
        step_2,
        step_3,
        step_4,
        step_5,
        step_6s AS step_6
    FROM `project.dataset.source`
), p1_funnel AS (
    -- Step totals per (flow, platform_type, platform).
    SELECT
        flow,
        platform_type,
        platform,
        SUM(step_1) AS step_1,
        SUM(step_2) AS step_2,
        SUM(step_3) AS step_3,
        SUM(step_4) AS step_4,
        SUM(step_5) AS step_5,
        SUM(step_6) AS step_6
    FROM p0_funnel
    GROUP BY
        flow,
        platform_type,
        platform
), dimension_values AS (
    -- p1_funnel is grouped by exactly these three columns, so each combination
    -- already appears once; no DISTINCT is required.
    SELECT
        flow,
        platform_type,
        platform
    FROM p1_funnel
), dimension_names AS (
    -- The dimensions to report on, as a three-row literal table.
    SELECT 'platform_type' AS dimension UNION ALL
    SELECT 'platform' UNION ALL
    SELECT 'flow'
), steps AS (
    -- The funnel step labels, as a six-row literal table.
    SELECT 'step_1' AS step UNION ALL
    SELECT 'step_2' UNION ALL
    SELECT 'step_3' UNION ALL
    SELECT 'step_4' UNION ALL
    SELECT 'step_5' UNION ALL
    SELECT 'step_6'
), full_grid AS (
    -- Every dimension-value combination x every dimension name x every step.
    SELECT
        dimension,
        step,
        flow,
        platform_type,
        platform
    FROM dimension_values
    CROSS JOIN dimension_names
    CROSS JOIN steps
)
SELECT
    dimension,
    -- Position within each (dimension, step), largest step_1 first. The three
    -- key columns are unique per partition and act as tie-breakers, so the
    -- numbering is reproducible when several rows share the same step_1.
    ROW_NUMBER() OVER (
        PARTITION BY dimension, step
        ORDER BY step_1 DESC, platform_type, platform, flow
    ) AS dim_order,
    -- Value of whichever column the row's dimension refers to.
    CASE dimension
        WHEN 'platform_type' THEN platform_type
        WHEN 'platform' THEN platform
        WHEN 'flow' THEN flow
        ELSE NULL
    END AS dim_value,
    step,
    -- Numeric position of the step label.
    CASE step
        WHEN 'step_1' THEN 1
        WHEN 'step_2' THEN 2
        WHEN 'step_3' THEN 3
        WHEN 'step_4' THEN 4
        WHEN 'step_5' THEN 5
        WHEN 'step_6' THEN 6
        ELSE NULL
    END AS step_order,
    -- Total for the row's own step.
    CASE step
        WHEN 'step_1' THEN step_1
        WHEN 'step_2' THEN step_2
        WHEN 'step_3' THEN step_3
        WHEN 'step_4' THEN step_4
        WHEN 'step_5' THEN step_5
        WHEN 'step_6' THEN step_6
        ELSE NULL
    END AS step_sessions,
    -- Total for the following step; the last step has none.
    CASE step
        WHEN 'step_1' THEN step_2
        WHEN 'step_2' THEN step_3
        WHEN 'step_3' THEN step_4
        WHEN 'step_4' THEN step_5
        WHEN 'step_5' THEN step_6
        WHEN 'step_6' THEN NULL
        ELSE NULL
    END AS next_step_sessions
FROM full_grid
-- Equality join on the three keys: a row whose key column is NULL finds no
-- match and is therefore absent from the output.
INNER JOIN p1_funnel USING (platform_type, platform, flow)
Run Code Online (Sandbox Code Playgroud)
当在正确的地方使用时,CROSS JOIN 似乎非常有用,尽管我仍然不太确定为什么 UNION ALL 无法完成这项工作。不言而喻,出现这种情况是有技术原因的,我很想知道为什么,但至少有一个针对这种情况和其他类似情况的解决方法。
| 归档时间: |
|
| 查看次数: |
2975 次 |
| 最近记录: |