Fel*_*ffa 5 sql google-bigquery
我正在计算在 Stack Overflow 上得到回复的平均时间,结果毫无意义。
#standardSQL
-- Per-tag count of answered questions and the plain arithmetic mean of the
-- minutes elapsed between a question's creation and its earliest answer.
WITH questions_since_2016 AS (
  -- Questions whose creation year is after 2015, each carrying a STRUCT with
  -- the earliest answer timestamp (`first`) and the number of answers (`c`),
  -- plus the pipe-delimited tag string split into an array.
  SELECT
    q.creation_date,
    (
      SELECT AS STRUCT
        MIN(ans.creation_date) AS first,
        COUNT(*) AS c
      FROM `bigquery-public-data.stackoverflow.posts_answers` AS ans
      WHERE ans.parent_id = q.id
    ) AS answers,
    SPLIT(q.tags, '|') AS tags
  FROM `bigquery-public-data.stackoverflow.posts_questions` AS q
  WHERE EXTRACT(YEAR FROM q.creation_date) > 2015
),
question_answers AS (
  -- One row per (question, tag) pair, restricted to the tags of interest and
  -- to questions that have at least one answer.
  SELECT
    *,
    TIMESTAMP_DIFF(answers.first, creation_date, MINUTE) AS minutes
  FROM questions_since_2016, UNNEST(questions_since_2016.tags) AS tag
  WHERE answers.c > 0
    AND tag IN ('java', 'javascript', 'google-bigquery', 'firebase', 'php')
)
SELECT
  tag,
  COUNT(*) AS questions,
  ROUND(AVG(minutes), 2) AS first_reply_avg_minutes
FROM question_answers
GROUP BY tag
Run Code Online (Sandbox Code Playgroud)
我应该如何计算平均时间?
2019 年更新:分享一些持久化的公共 UDF怎么样?
第一个,中位数:
-- Calls the persistent UDF fhoffa.x.median on a literal array; the value
-- printed on the next line is the result reported for this input.
SELECT
  fhoffa.x.median([1, 1, 1, 2, 3, 4, 5, 100, 1000])
3.0
Run Code Online (Sandbox Code Playgroud)
确实 - 在 Stack Overflow 上获得答案的平均时间超过 100 小时(>6000 分钟)似乎是错误的 - 并且主要是由异常值驱动的。
与其简单地使用 AVG(),不如计算:
- 几何平均值:EXP(AVG(LOG(GREATEST(minutes,1))))
- 去除异常值后的平均值:AVG(q) FROM (SELECT q FROM QUANTILES(q, 100) LIMIT 80 OFFSET 2))
- 中位数:all_minutes[OFFSET(CAST(ARRAY_LENGTH(all_minutes)/2 AS INT64))]
如果您使用任何这些替代方案,结果会更有意义:
正如你在这里看到的,在这种情况下,去除异常值给我们的结果类似于几何平均值——而中值报告的数字甚至更低。使用哪一种?你的选择。
-- Compares several "typical time to first answer" measures per tag:
--   avg_minutes                  plain arithmetic mean
--   first_reply_avg_minutes_geom geometric mean (minutes clamped to >= 1)
--   avg_no_outliers              mean of a trimmed slice of the quantile boundaries
--   median_minutes               middle element of the sorted minutes
-- Rows are returned with the most-asked tag first.
WITH question_answers AS (
  -- One row per (question, tag) pair for questions created after 2015 that
  -- have at least one answer; `minutes` is the delay until the earliest answer.
  SELECT *
    , TIMESTAMP_DIFF(answers.first, creation_date, MINUTE) AS minutes
  FROM (
    SELECT a.creation_date
      -- Correlated lookup: earliest answer timestamp and answer count for this question.
      , (SELECT AS STRUCT MIN(b.creation_date) AS first, COUNT(*) AS c
         FROM `bigquery-public-data.stackoverflow.posts_answers` AS b
         WHERE a.id = b.parent_id
        ) AS answers
      , SPLIT(a.tags, '|') AS tags
    FROM `bigquery-public-data.stackoverflow.posts_questions` AS a
    WHERE EXTRACT(YEAR FROM a.creation_date) > 2015
  ), UNNEST(tags) AS tag
  WHERE tag IN ('java', 'javascript', 'google-bigquery', 'firebase', 'php', 'sql', 'elasticsearch', 'apache-kafka', 'tensorflow')
    AND answers.c > 0
)
SELECT * EXCEPT(qs, all_minutes)
  -- APPROX_QUANTILES(x, 100) yields 101 boundaries (min .. max); skipping the
  -- first 2 and keeping the next 80 drops both tails before averaging.
  , (SELECT ROUND(AVG(q), 2) FROM (SELECT q FROM UNNEST(qs) AS q ORDER BY q LIMIT 80 OFFSET 2)) AS avg_no_outliers
  -- Integer division picks the true middle offset. The previous
  -- CAST(len/2 AS INT64) rounded .5 away from zero, which selected one element
  -- past the middle for odd lengths and went out of bounds for a single-element
  -- array. For even lengths this returns the upper of the two middle values.
  , all_minutes[OFFSET(DIV(ARRAY_LENGTH(all_minutes), 2))] AS median_minutes
FROM (
  SELECT tag
    , COUNT(*) AS questions
    , ROUND(AVG(minutes), 2) AS avg_minutes
    -- GREATEST(minutes, 1) keeps LOG away from zero for sub-minute answers.
    , ROUND(EXP(AVG(LOG(GREATEST(minutes, 1)))), 2) AS first_reply_avg_minutes_geom
    , APPROX_QUANTILES(minutes, 100) AS qs
    , ARRAY_AGG(minutes IGNORE NULLS ORDER BY minutes) AS all_minutes
  FROM question_answers
  GROUP BY tag
)
ORDER BY questions DESC
Run Code Online (Sandbox Code Playgroud)
附:Elliott 提供的 MEDIAN() UDF 函数:
-- MEDIAN(arr): median of the non-NULL elements of an array.
--   arr     : array of values that can be ordered and, for even lengths,
--             added and divided by 2.
--   returns : the middle element for an odd count of non-NULL elements, or the
--             mean of the two middle elements for an even count.
-- NULL elements are skipped (IGNORE NULLS) so they can neither shift the middle
-- position nor turn the even-length average into NULL.
-- NOTE(review): with an empty or all-NULL input ARRAY_AGG yields NULL, so the
-- result is expected to be NULL rather than an error - confirm if relied upon.
CREATE TEMP FUNCTION MEDIAN(arr ANY TYPE) AS ((
  SELECT
    IF(
      MOD(ARRAY_LENGTH(sorted_vals), 2) = 0,
      -- Even count: average the two elements around the midpoint.
      (sorted_vals[OFFSET(DIV(ARRAY_LENGTH(sorted_vals), 2) - 1)] + sorted_vals[OFFSET(DIV(ARRAY_LENGTH(sorted_vals), 2))]) / 2,
      -- Odd count: the single middle element.
      sorted_vals[OFFSET(DIV(ARRAY_LENGTH(sorted_vals), 2))]
    )
  FROM (
    -- Sort the input; the distinct alias avoids shadowing the `arr` parameter.
    SELECT ARRAY_AGG(x IGNORE NULLS ORDER BY x) AS sorted_vals
    FROM UNNEST(arr) AS x
  )
));
Run Code Online (Sandbox Code Playgroud)