Cha*_*les 35 python sqlalchemy pandas
我有一些相当大的pandas DataFrames,我想使用新的批量SQL映射通过SQL Alchemy将它们上传到Microsoft SQL Server.pandas.to_sql方法虽然不错,但速度很慢.
我在编写代码时遇到了麻烦......
我希望能够将这个函数传递给我正在调用的pandas DataFrame,我正在调用table的模式名称schema,以及我正在调用的表名name.理想情况下,该函数将1.)删除表,如果它已经存在.2.)创建一个新表3.)创建一个mapper和4.)使用mapper和pandas数据批量插入.我被困在第3部分.
这是我的(诚然粗糙的)代码.我正在努力解决如何让mapper函数与我的主键一起工作.我真的不需要主键,但映射器功能需要它.
感谢您的见解.
from sqlalchemy import create_engine Table, Column, MetaData
from sqlalchemy.orm import mapper, create_session
from sqlalchemy.ext.declarative import declarative_base
from pandas.io.sql import SQLTable, SQLDatabase
def bulk_upload(table, schema, name):
e = create_engine('mssql+pyodbc://MYDB')
s = create_session(bind=e)
m = MetaData(bind=e,reflect=True,schema=schema)
Base = declarative_base(bind=e,metadata=m)
t = Table(name,m)
m.remove(t)
t.drop(checkfirst=True)
sqld = SQLDatabase(e, schema=schema,meta=m)
sqlt = SQLTable(name, sqld, table).table
sqlt.metadata = m
m.create_all(bind=e,tables=[sqlt])
class MyClass(Base):
return
mapper(MyClass, sqlt)
s.bulk_insert_mappings(MyClass, table.to_dict(orient='records'))
return
Run Code Online (Sandbox Code Playgroud)
ans*_*onw 28
我遇到了类似的问题,pd.to_sql需要花费数小时才能上传数据.以下代码批量在几秒钟内插入相同的数据.
from sqlalchemy import create_engine
import psycopg2 as pg
#load python script that batch loads pandas df to sql
import cStringIO
address = 'postgresql://<username>:<pswd>@<host>:<port>/<database>'
engine = create_engine(address)
connection = engine.raw_connection()
cursor = connection.cursor()
#df is the dataframe containing an index and the columns "Event" and "Day"
#create Index column to use as primary key
df.reset_index(inplace=True)
df.rename(columns={'index':'Index'}, inplace =True)
#create the table but first drop if it already exists
command = '''DROP TABLE IF EXISTS localytics_app2;
CREATE TABLE localytics_app2
(
"Index" serial primary key,
"Event" text,
"Day" timestamp without time zone,
);'''
cursor.execute(command)
connection.commit()
#stream the data using 'to_csv' and StringIO(); then use sql's 'copy_from' function
output = cStringIO.StringIO()
#ignore the index
df.to_csv(output, sep='\t', header=False, index=False)
#jump to start of stream
output.seek(0)
contents = output.getvalue()
cur = connection.cursor()
#null values become ''
cur.copy_from(output, 'localytics_app2', null="")
connection.commit()
cur.close()
Run Code Online (Sandbox Code Playgroud)
Aka*_*njo 16
这可能已经得到了回答,但我通过在此站点上整理不同的答案并与SQLAlchemy的doc对齐来找到解决方案.
希望这有助于任何来到这里并想要快速混合Panda和SQLAlchemy的人.
from urllib import quote_plus as urlquote
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, Numeric
from sqlalchemy.orm import sessionmaker
import pandas as pd
# Set up of the engine to connect to the database
# the urlquote is used for passing the password which might contain special characters such as "/"
engine = create_engine('mysql://root:%s@localhost/db1' % urlquote('weirdPassword*withsp€cialcharacters'), echo=False)
conn = engine.connect()
Base = declarative_base()
#Declaration of the class in order to write into the database. This structure is standard and should align with SQLAlchemy's doc.
class Current(Base):
__tablename__ = 'tableName'
id = Column(Integer, primary_key=True)
Date = Column(String(500))
Type = Column(String(500))
Value = Column(Numeric())
def __repr__(self):
return "(id='%s', Date='%s', Type='%s', Value='%s')" % (self.id, self.Date, self.Type, self.Value)
# Set up of the table in db and the file to import
fileToRead = 'file.csv'
tableToWriteTo = 'tableName'
# Panda to create a lovely dataframe
df_to_be_written = pd.read_csv(fileToRead)
# The orient='records' is the key of this, it allows to align with the format mentioned in the doc to insert in bulks.
listToWrite = df_to_be_written.to_dict(orient='records')
metadata = sqlalchemy.schema.MetaData(bind=engine,reflect=True)
table = sqlalchemy.Table(tableToWriteTo, metadata, autoload=True)
# Open the session
Session = sessionmaker(bind=engine)
session = Session()
# Inser the dataframe into the database in one bulk
conn.execute(table.insert(), listToWrite)
# Commit the changes
session.commit()
# Close the session
session.close()
Run Code Online (Sandbox Code Playgroud)
Fab*_*les 15
基于@ansonw答案:
def to_sql(engine, df, table, if_exists='fail', sep='\t', encoding='utf8'):
# Create Table
df[:0].to_sql(table, engine, if_exists=if_exists)
# Prepare data
output = cStringIO.StringIO()
df.to_csv(output, sep=sep, header=False, encoding=encoding)
output.seek(0)
# Insert data
connection = engine.raw_connection()
cursor = connection.cursor()
cursor.copy_from(output, table, sep=sep, null='')
connection.commit()
cursor.close()
Run Code Online (Sandbox Code Playgroud)
我在5秒而不是4分钟内插入200000行
小智 10
Pandas 0.25.1 有一个参数可以进行多次插入,因此不再需要使用 SQLAlchemy 来解决这个问题。
method='multi'调用时设置pandas.DataFrame.to_sql。
在这个例子中,它将是
df.to_sql(table, schema=schema, con=e, index=False, if_exists='replace', method='multi')
答案来自此处的文档
值得注意的是,我只用 Redshift 对此进行了测试。请让我知道它在其他数据库上的运行情况,以便我可以更新此答案。
由于这是 I/O 繁重的工作负载,您还可以通过multiprocessing.dummy使用 python 线程模块。这对我来说加快了速度:
import math
from multiprocessing.dummy import Pool as ThreadPool
...
def insert_df(df, *args, **kwargs):
nworkers = 4
chunksize = math.floor(df.shape[0] / nworkers)
chunks = [(chunksize * i, (chunksize * i) + chunksize) for i in range(nworkers)]
chunks.append((chunksize * nworkers, df.shape[0]))
pool = ThreadPool(nworkers)
def worker(chunk):
i, j = chunk
df.iloc[i:j, :].to_sql(*args, **kwargs)
pool.map(worker, chunks)
pool.close()
pool.join()
....
insert_df(df, "foo_bar", engine, if_exists='append')
Run Code Online (Sandbox Code Playgroud)
下面我的 postgres 特定解决方案使用 pandas 数据框自动创建数据库表,并使用 postgres 执行快速批量插入COPY my_table FROM ...
import io
import pandas as pd
from sqlalchemy import create_engine
def write_to_table(df, db_engine, schema, table_name, if_exists='fail'):
string_data_io = io.StringIO()
df.to_csv(string_data_io, sep='|', index=False)
pd_sql_engine = pd.io.sql.pandasSQL_builder(db_engine, schema=schema)
table = pd.io.sql.SQLTable(table_name, pd_sql_engine, frame=df,
index=False, if_exists=if_exists, schema=schema)
table.create()
string_data_io.seek(0)
string_data_io.readline() # remove header
with db_engine.connect() as connection:
with connection.connection.cursor() as cursor:
copy_cmd = "COPY %s.%s FROM STDIN HEADER DELIMITER '|' CSV" % (schema, table_name)
cursor.copy_expert(copy_cmd, string_data_io)
connection.connection.commit()
Run Code Online (Sandbox Code Playgroud)
对于像我这样尝试实施上述解决方案的人:
Pandas 0.24.0 现在有 to_sql ,带有 chunksize 和 method='multi' 选项,可以批量插入...
.
https://www.microsoft.com/en-us/download/details.aspx?id=56567
from sqlalchemy import create_engine
import urllib
server = '*****'
database = '********'
username = '**********'
password = '*********'
params = urllib.parse.quote_plus(
'DRIVER={ODBC Driver 17 for SQL Server};'+
'SERVER='+server+';DATABASE='+database+';UID='+username+';PWD='+ password)
engine = create_engine("mssql+pyodbc:///?odbc_connect=%s" % params)
#Checking Connection
connected = pd.io.sql._is_sqlalchemy_connectable(engine)
print(connected) #Output is True if connection established successfully
Run Code Online (Sandbox Code Playgroud)
df.to_sql('Table_Name', con=engine, if_exists='append', index=False)
"""
if_exists: {'fail', 'replace', 'append'}, default 'fail'
fail: If table exists, do nothing.
replace: If table exists, drop it, recreate it, and insert data.
append: If table exists, insert data. Create if does not exist.
"""
Run Code Online (Sandbox Code Playgroud)
如果有很多记录
# limit based on sp_prepexec parameter count
tsql_chunksize = 2097 // len(bd_pred_score_100.columns)
# cap at 1000 (limit for number of rows inserted by table-value constructor)
tsql_chunksize = 1000 if tsql_chunksize > 1000 else tsql_chunksize
print(tsql_chunksize)
df.to_sql('table_name', con = engine, if_exists = 'append', index= False, chunksize=tsql_chunksize)
Run Code Online (Sandbox Code Playgroud)
PS:您可以根据需要更改参数。
| 归档时间: |
|
| 查看次数: |
45035 次 |
| 最近记录: |