Joe*_*tto 5 sql google-bigquery
使用 BigQuery,是否可以从项目中每个数据集的 __TABLES__ 中进行查询?我试过 SELECT * FROM '*.__TABLES',但 BigQuery 不允许这样做。任何帮助都将不胜感激,谢谢!
__TABLES__ 语法仅支持特定数据集,不适用于跨数据集
你可以按如下方式处理:
#standardSQL
-- Stack the per-dataset __TABLES__ metadata of each listed dataset into one
-- result set. __TABLES__ can only be addressed one dataset at a time, so every
-- dataset has to be named explicitly and combined with UNION ALL.
WITH ALL__TABLES__ AS (
    SELECT * FROM `bigquery-public-data.1000_genomes.__TABLES__`
    UNION ALL
    SELECT * FROM `bigquery-public-data.baseball.__TABLES__`
    UNION ALL
    SELECT * FROM `bigquery-public-data.bls.__TABLES__`
    UNION ALL
    SELECT * FROM `bigquery-public-data.census_bureau_usa.__TABLES__`
    UNION ALL
    SELECT * FROM `bigquery-public-data.cloud_storage_geo_index.__TABLES__`
    UNION ALL
    SELECT * FROM `bigquery-public-data.cms_codes.__TABLES__`
    UNION ALL
    SELECT * FROM `bigquery-public-data.common_us.__TABLES__`
    UNION ALL
    SELECT * FROM `bigquery-public-data.fec.__TABLES__`
    UNION ALL
    SELECT * FROM `bigquery-public-data.genomics_cannabis.__TABLES__`
    UNION ALL
    SELECT * FROM `bigquery-public-data.ghcn_d.__TABLES__`
    UNION ALL
    SELECT * FROM `bigquery-public-data.ghcn_m.__TABLES__`
    UNION ALL
    SELECT * FROM `bigquery-public-data.github_repos.__TABLES__`
    UNION ALL
    SELECT * FROM `bigquery-public-data.hacker_news.__TABLES__`
    UNION ALL
    SELECT * FROM `bigquery-public-data.irs_990.__TABLES__`
    UNION ALL
    SELECT * FROM `bigquery-public-data.medicare.__TABLES__`
    UNION ALL
    SELECT * FROM `bigquery-public-data.new_york.__TABLES__`
    UNION ALL
    SELECT * FROM `bigquery-public-data.nlm_rxnorm.__TABLES__`
    UNION ALL
    SELECT * FROM `bigquery-public-data.noaa_gsod.__TABLES__`
    UNION ALL
    SELECT * FROM `bigquery-public-data.open_images.__TABLES__`
    UNION ALL
    SELECT * FROM `bigquery-public-data.samples.__TABLES__`
    UNION ALL
    SELECT * FROM `bigquery-public-data.san_francisco.__TABLES__`
    UNION ALL
    SELECT * FROM `bigquery-public-data.stackoverflow.__TABLES__`
    UNION ALL
    SELECT * FROM `bigquery-public-data.usa_names.__TABLES__`
    UNION ALL
    SELECT * FROM `bigquery-public-data.utility_us.__TABLES__`
)
SELECT *
FROM ALL__TABLES__
Run Code Online (Sandbox Code Playgroud)
在这种情况下,您需要提前知道数据集列表;可以通过 Datasets: list API 或相应的 bq ls 命令轻松获取。
请注意:上述方法仅适用于位于同一位置(location)的数据集。如果您的数据集分布在不同的位置,则需要按位置分别编写查询,每个位置一个查询。
例如:
#standardSQL
-- Same stacking of per-dataset __TABLES__ metadata, restricted to the two
-- listed *_eu datasets so that they are queried separately from the others.
WITH ALL_EU__TABLES__ AS (
    SELECT * FROM `bigquery-public-data.common_eu.__TABLES__`
    UNION ALL
    SELECT * FROM `bigquery-public-data.utility_eu.__TABLES__`
)
SELECT *
FROM ALL_EU__TABLES__
Run Code Online (Sandbox Code Playgroud)
小智 7
您可以使用此 SQL 查询生成项目中的数据集列表:
-- Generates query text rather than table data: one
-- "select * from `<project>.<dataset>.__TABLES__`" statement per row of
-- INFORMATION_SCHEMA.SCHEMATA, joined with "union all" and a newline.
-- Replace [PROJECT ID] with the real project id before running.
SELECT
    STRING_AGG(
        CONCAT("select * from `[PROJECT ID].", schema_name, ".__TABLES__` "),
        "union all \n"
    )
FROM `[PROJECT ID]`.INFORMATION_SCHEMA.SCHEMATA;
Run Code Online (Sandbox Code Playgroud)
您将得到如下列表:
select * from `[PROJECT ID].[DATASET ID 1].__TABLES__` union all
select * from `[PROJECT ID].[DATASET ID 2].__TABLES__` union all
select * from `[PROJECT ID].[DATASET ID 3].__TABLES__` union all
select * from `[PROJECT ID].[DATASET ID 4].__TABLES__`
...
Run Code Online (Sandbox Code Playgroud)
然后将列表放在此查询中:
-- Human-readable report over the stacked __TABLES__ rows of several datasets.
-- Paste the generated "select ... union all" list into the all_tables CTE.
WITH all_tables AS (
    SELECT * FROM `[PROJECT ID].[DATASET ID 1].__TABLES__`
    UNION ALL
    SELECT * FROM `[PROJECT ID].[DATASET ID 2].__TABLES__`
    UNION ALL
    SELECT * FROM `[PROJECT ID].[DATASET ID 3].__TABLES__`
    UNION ALL
    SELECT * FROM `[PROJECT ID].[DATASET ID 4].__TABLES__`
)
SELECT
    table_id,
    -- creation_time / last_modified_time are passed through TIMESTAMP_MILLIS,
    -- i.e. they are treated as epoch milliseconds.
    DATE(TIMESTAMP_MILLIS(creation_time)) AS creation_date,
    DATE(TIMESTAMP_MILLIS(last_modified_time)) AS last_modified_date,
    row_count,
    size_bytes,
    -- Decimal units: 1 MB = 1000 * 1000 bytes, 1 GB = 1000 * 1000 * 1000 bytes.
    ROUND(SAFE_DIVIDE(size_bytes, (1000 * 1000)), 1) AS size_mb,
    ROUND(SAFE_DIVIDE(size_bytes, (1000 * 1000 * 1000)), 2) AS size_gb,
    -- Numeric type code mapped to a label; any other code is shown as '?'.
    CASE
        WHEN type = 1 THEN 'table'
        WHEN type = 2 THEN 'view'
        WHEN type = 3 THEN 'external'
        ELSE '?'
    END AS type,
    TIMESTAMP_MILLIS(creation_time) AS creation_time,
    TIMESTAMP_MILLIS(last_modified_time) AS last_modified_time,
    FORMAT_TIMESTAMP("%Y-%m", TIMESTAMP_MILLIS(last_modified_time)) AS last_modified_month,
    dataset_id,
    project_id
FROM all_tables
ORDER BY
    dataset_id ASC,
    table_id ASC
Run Code Online (Sandbox Code Playgroud)
我知道您要求使用 BigQuery,但我编写了一个 Python 脚本来获取您要求的信息,也许它可以帮助其他编码人员:
pip 安装:
# The script imports `google.cloud.bigquery`, which ships in the
# `google-cloud-bigquery` distribution. The old `google-cloud` meta-package is
# deprecated and no longer installs any client libraries.
!pip install google-cloud-bigquery
!pip install google-api-python-client
!pip install oauth2client
Run Code Online (Sandbox Code Playgroud)
代码:
import subprocess
import sys
import threading
from google.cloud import bigquery
def _worker_query(project, dataset_id, results_scan):
    """Count the rows of one dataset's ``__TABLES__`` listing and record the count.

    Runs ``SELECT *`` against ``<project>.<dataset_id>.__TABLES__`` through the
    module-level ``client`` and appends
    ``{'dataset_id': dataset_id, 'count': <rows returned>}`` to ``results_scan``.

    Args:
        project: Project id interpolated into the table path.
        dataset_id: Dataset id interpolated into the table path.
        results_scan: List supplied by the caller; one dict is appended per call.
    """
    sql = 'SELECT * FROM `{}.{}.__TABLES__`'.format(project, dataset_id)
    result_rows = client.query(sql).result()
    # Only the number of returned rows matters; the caller reports the summed
    # counts as the total number of tables.
    table_total = sum(1 for _ in result_rows)
    results_scan.append({'dataset_id': dataset_id, 'count': table_total})
def main_execute():
    """Print a per-dataset row count of ``__TABLES__`` and the grand total.

    Lists the datasets of the ``bigquery-public-data`` project through the
    module-level ``client``, starts one thread per dataset running
    ``_worker_query``, waits for all of them, then prints each per-dataset
    result dict followed by the summed count.
    """
    project = 'bigquery-public-data'
    datasets = client.list_datasets(project)
    threads_project = []
    # Shared list that every worker appends its result dict to.
    results_scan = []
    for d in datasets:
        t = threading.Thread(
            target=_worker_query,
            args=(project, d.dataset_id, results_scan),
        )
        threads_project.append(t)
        t.start()
    # Wait for every worker before reading the shared results.
    for t in threads_project:
        t.join()
    for result in results_scan:
        print(result)
    total_count = sum(result['count'] for result in results_scan)
    print('\n\nTOTAL TABLES: "{}"'.format(total_count))
# Path of the service-account JSON key file handed to the client factory below.
JSON_FILE_NAME = 'sa_bq.json'
# Module-level BigQuery client; _worker_query and main_execute read it as a global.
client = bigquery.Client.from_service_account_json(JSON_FILE_NAME)
# Run the scan immediately when the script is executed.
main_execute()
Run Code Online (Sandbox Code Playgroud)
| 归档时间: |
|
| 查看次数: |
13796 次 |
| 最近记录: |