aba*_*ter 2 python pandas google-bigquery
我有一个 pandas 数据框,并且已将其写入文件。我还为这些数据创建了一个 JSON 格式的架构(schema),它既保存为 Python 字典,也已写入文件。
我尝试过使用 to_gbq 和 bq 命令行两种方式上传,在这两种情况下,我都收到有关重复字段(repeated field,且是同一个字段)的错误。
这是有关数据的信息:
代码
df.shape
len(clinvar_variant_schema)
schema_fields = [x['name'] for x in clinvar_variant_schema]
schema_fields.sort()
json.dumps(schema_fields)
colnames = df.columns.tolist()
colnames.sort()
json.dumps(colnames)
set(schema_fields).difference(set(colnames))
set(colnames).difference(set(schema_fields))
Run Code Online (Sandbox Code Playgroud)
输出
(1000, 24)
24
'["AF_ESP", "AF_EXAC", "AF_TGP", "ALLELEID", "ALT", "CHROM", "CLNDISDB", "CLNDN", "CLNHGVS", "CLNREVSTAT", "CLNSIG", "CLNSIGCONF", "CLNVC", "CLNVCSO", "CLNVI", "FILTER", "GENEINFO", "ID", "MC", "ORIGIN", "POS", "QUAL", "REF", "RS"]'
'["AF_ESP", "AF_EXAC", "AF_TGP", "ALLELEID", "ALT", "CHROM", "CLNDISDB", "CLNDN", "CLNHGVS", "CLNREVSTAT", "CLNSIG", "CLNSIGCONF", "CLNVC", "CLNVCSO", "CLNVI", "FILTER", "GENEINFO", "ID", "MC", "ORIGIN", "POS", "QUAL", "REF", "RS"]'
set()
set()
Colnames Schema_Names
0 AF_ESP AF_ESP
1 AF_EXAC AF_EXAC
2 AF_TGP AF_TGP
3 ALLELEID ALLELEID
4 ALT ALT
5 CHROM CHROM
6 CLNDISDB CLNDISDB
7 CLNDN CLNDN
8 CLNHGVS CLNHGVS
9 CLNREVSTAT CLNREVSTAT
10 CLNSIG CLNSIG
11 CLNSIGCONF CLNSIGCONF
12 CLNVC CLNVC
13 CLNVCSO CLNVCSO
14 CLNVI CLNVI
15 FILTER FILTER
16 GENEINFO GENEINFO
17 ID ID
18 MC MC
19 ORIGIN ORIGIN
20 POS POS
21 QUAL QUAL
22 REF REF
23 RS RS
Run Code Online (Sandbox Code Playgroud)
project_id = "my_project_id"
table_id = "my_dataset.clinvar_vcf"
df.to_gbq(
destination_table = table_id,
project_id = project_id,
if_exists = "replace",
table_schema = clinvar_variant_schema
)
Run Code Online (Sandbox Code Playgroud)
IPython will make a temporary file named: /tmp/ipython_edit_pw1v55fy/ipython_edit_k3v1q7m5.py
1it [00:00, 2.93it/s]
---------------------------------------------------------------------------
InvalidResponse Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/google/cloud/bigquery/client.py in load_table_from_file(self, file_obj, destination, rewind, size, num_retries, job_id, job_id_prefix, location, project, job_config)
1829 response = self._do_resumable_upload(
-> 1830 file_obj, job_resource, num_retries
1831 )
/opt/conda/lib/python3.7/site-packages/google/cloud/bigquery/client.py in _do_resumable_upload(self, stream, metadata, num_retries)
2153 upload, transport = self._initiate_resumable_upload(
-> 2154 stream, metadata, num_retries
2155 )
/opt/conda/lib/python3.7/site-packages/google/cloud/bigquery/client.py in _initiate_resumable_upload(self, stream, metadata, num_retries)
2195 upload.initiate(
-> 2196 transport, stream, metadata, _GENERIC_CONTENT_TYPE, stream_final=False
2197 )
/opt/conda/lib/python3.7/site-packages/google/resumable_media/requests/upload.py in initiate(self, transport, stream, metadata, content_type, total_bytes, stream_final, timeout)
411 )
--> 412 self._process_initiate_response(response)
413 return response
/opt/conda/lib/python3.7/site-packages/google/resumable_media/_upload.py in _process_initiate_response(self, response)
505 self._get_status_code,
--> 506 callback=self._make_invalid,
507 )
/opt/conda/lib/python3.7/site-packages/google/resumable_media/_helpers.py in require_status_code(response, status_codes, get_status_code, callback)
110 u"Expected one of",
--> 111 *status_codes
112 )
InvalidResponse: ('Request failed with status code', 400, 'Expected one of', <HTTPStatus.OK: 200>, <HTTPStatus.CREATED: 201>)
During handling of the above exception, another exception occurred:
BadRequest Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/pandas_gbq/gbq.py in load_data(self, dataframe, dataset_id, table_id, chunksize, schema, progress_bar)
628 chunks = tqdm.tqdm(chunks)
--> 629 for remaining_rows in chunks:
630 logger.info(
/opt/conda/lib/python3.7/site-packages/tqdm/std.py in __iter__(self)
1173 try:
-> 1174 for obj in iterable:
1175 yield obj
/opt/conda/lib/python3.7/site-packages/pandas_gbq/load.py in load_chunks(client, dataframe, dataset_id, table_id, chunksize, schema, location)
81 job_config=job_config,
---> 82 location=location,
83 ).result()
/opt/conda/lib/python3.7/site-packages/google/cloud/bigquery/client.py in load_table_from_file(self, file_obj, destination, rewind, size, num_retries, job_id, job_id_prefix, location, project, job_config)
1836 except resumable_media.InvalidResponse as exc:
-> 1837 raise exceptions.from_http_response(exc.response)
1838
BadRequest: 400 POST https://bigquery.googleapis.com/upload/bigquery/v2/projects/eacri-genomics/jobs?uploadType=resumable: Cannot load CSV data with a repeated field. Field: FILTER
During handling of the above exception, another exception occurred:
GenericGBQException Traceback (most recent call last)
<ipython-input-55-19cb6dc0a4ee> in <module>
6 project_id = project_id,
7 if_exists = "replace",
----> 8 table_schema = clinvar_variant_schema
9 )
/opt/conda/lib/python3.7/site-packages/pandas/core/frame.py in to_gbq(self, destination_table, project_id, chunksize, reauth, if_exists, auth_local_webserver, table_schema, location, progress_bar, credentials)
1655 location=location,
1656 progress_bar=progress_bar,
-> 1657 credentials=credentials,
1658 )
1659
/opt/conda/lib/python3.7/site-packages/pandas/io/gbq.py in to_gbq(dataframe, destination_table, project_id, chunksize, reauth, if_exists, auth_local_webserver, table_schema, location, progress_bar, credentials, verbose, private_key)
226 credentials=credentials,
227 verbose=verbose,
--> 228 private_key=private_key,
229 )
/opt/conda/lib/python3.7/site-packages/pandas_gbq/gbq.py in to_gbq(dataframe, destination_table, project_id, chunksize, reauth, if_exists, auth_local_webserver, table_schema, location, progress_bar, credentials, verbose, private_key)
1206 chunksize=chunksize,
1207 schema=table_schema,
-> 1208 progress_bar=progress_bar,
1209 )
1210
/opt/conda/lib/python3.7/site-packages/pandas_gbq/gbq.py in load_data(self, dataframe, dataset_id, table_id, chunksize, schema, progress_bar)
634 )
635 except self.http_error as ex:
--> 636 self.process_http_error(ex)
637
638 def schema(self, dataset_id, table_id):
/opt/conda/lib/python3.7/site-packages/pandas_gbq/gbq.py in process_http_error(ex)
433 # <https://cloud.google.com/bigquery/troubleshooting-errors>`__
434
--> 435 raise GenericGBQException("Reason: {0}".format(ex))
436
437 def run_query(
GenericGBQException: Reason: 400 POST https://bigquery.googleapis.com/upload/bigquery/v2/projects/eacri-genomics/jobs?uploadType=resumable: Cannot load CSV data with a repeated field. Field: FILTER
Run Code Online (Sandbox Code Playgroud)
bq load --source_format=CSV --field_delimiter=tab --replace --schema clinvar_variant_schema.json my_project_id:my_dataset.clinvar_vcf clinvar_expanded_vcf
Run Code Online (Sandbox Code Playgroud)
BigQuery error in load operation: Cannot load CSV data with a repeated field.
Field: FILTER
---------------------------------------------------------------------------
CalledProcessError Traceback (most recent call last)
<ipython-input-45-be7b46a6694c> in <module>
----> 1 get_ipython().run_cell_magic('bash', '', 'bq load --source_format=CSV --field_delimiter=tab --replace --schema clinvar_variant_schema.json eacri-genomics:clinvar_9302020.clinvar_vcf clinvar_expanded_vcf\n')
/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py in run_cell_magic(self, magic_name, line, cell)
2379 with self.builtin_trap:
2380 args = (magic_arg_s, cell)
-> 2381 result = fn(*args, **kwargs)
2382 return result
2383
/opt/conda/lib/python3.7/site-packages/IPython/core/magics/script.py in named_script_magic(line, cell)
140 else:
141 line = script
--> 142 return self.shebang(line, cell)
143
144 # write a basic docstring:
<decorator-gen-103> in shebang(self, line, cell)
/opt/conda/lib/python3.7/site-packages/IPython/core/magic.py in <lambda>(f, *a, **k)
185 # but it's overkill for just that one bit of state.
186 def magic_deco(arg):
--> 187 call = lambda f, *a, **k: f(*a, **k)
188
189 if callable(arg):
/opt/conda/lib/python3.7/site-packages/IPython/core/magics/script.py in shebang(self, line, cell)
243 sys.stderr.flush()
244 if args.raise_error and p.returncode!=0:
--> 245 raise CalledProcessError(p.returncode, cell, output=out, stderr=err)
246
247 def _run_script(self, p, cell, to_close):
CalledProcessError: Command 'b'bq load --source_format=CSV --field_delimiter=tab --replace --schema clinvar_variant_schema.json eacri-genomics:clinvar_9302020.clinvar_vcf clinvar_expanded_vcf\n'' returned non-zero exit status 1.
Run Code Online (Sandbox Code Playgroud)