I am trying to upload a file to my S3 bucket using a pre-signed URL. It works fine and uploads the data to the bucket successfully, but the files I upload are very large and I need to be able to display a progress bar. I have tried many of the solutions available on StackOverflow and in other blog posts, but none of them seem to help.
Below is the code snippet that uploads the data to S3 using the pre-signed URL.
import requests

# create_presigned_post() is a helper (as in the boto3 docs) returning {'url': ..., 'fields': ...}
object_name = 'DataSet.csv'
response = create_presigned_post("mybucket_name", object_name)
fields = response['fields']

with open(object_name, 'rb') as f:
    files = {'file': (object_name, f)}
    http_response = requests.post(response['url'], data=fields, files=files, stream=True)

print(http_response.status_code)
It returns a 204 status for a successful upload.
Now, what changes can I make to this code to display a progress bar?
P.S. I have tried stream=True on the request; it doesn't help. I have also tried iterating over the response with tqdm, but that doesn't work in this case either.
I don't think there is a way to do this when uploading a large file through the default HTTP POST request with a presigned URL. You can achieve it by using AWS S3's multipart upload mechanism: you know when each individual part has finished uploading and can compute the progress from that. I wrote a post with code snippets for multipart uploads with presigned URLs (in TypeScript): https://www.altostra.com/blog/multipart-uploads-with-s3-presigned-url
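To illustrate the idea, here is a minimal sketch of my own (not taken from the linked post): once you have one presigned upload_part URL per part, you can advance a tqdm progress bar after every part is PUT. The names presigned_urls and part_size are assumed inputs, produced by a multipart-upload setup like the one shown further below.

import os
import requests
from tqdm import tqdm

def upload_parts_with_progress(file_path, presigned_urls, part_size):
    """Upload a file part by part and show byte-level progress with tqdm."""
    parts = []
    total = os.path.getsize(file_path)
    with open(file_path, 'rb') as f, tqdm(total=total, unit='B', unit_scale=True) as bar:
        for part_no, url in enumerate(presigned_urls, start=1):
            chunk = f.read(part_size)
            if not chunk:
                break
            res = requests.put(url, data=chunk)
            res.raise_for_status()
            # S3 returns the part's ETag in the response headers; it is needed
            # later to complete the multipart upload.
            parts.append({'ETag': res.headers['ETag'], 'PartNumber': part_no})
            bar.update(len(chunk))  # advance the bar by the bytes just uploaded
    return parts

The returned parts list has exactly the shape that complete_multipart_upload expects.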
The code below works fine for Python; I found it here.
import logging
import argparse
from pathlib import Path

from boto3 import Session
import requests
logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
class S3MultipartUploadUtil:
    """
    AWS S3 Multipart Upload Util
    """
    def __init__(self, session: Session):
        self.session = session
        self.s3 = session.client('s3')
        self.upload_id = None
        self.bucket_name = None
        self.key = None

    def start(self, bucket_name: str, key: str):
        """
        Start Multipart Upload
        :param bucket_name:
        :param key:
        :return:
        """
        self.bucket_name = bucket_name
        self.key = key
        res = self.s3.create_multipart_upload(Bucket=bucket_name, Key=key)
        self.upload_id = res['UploadId']
        logger.debug(f"Start multipart upload '{self.upload_id}'")
    def create_presigned_url(self, part_no: int, expire: int = 3600) -> str:
        """
        Create pre-signed URL for upload part.
        :param part_no:
        :param expire:
        :return:
        """
        signed_url = self.s3.generate_presigned_url(
            ClientMethod='upload_part',
            Params={'Bucket': self.bucket_name,
                    'Key': self.key,
                    'UploadId': self.upload_id,
                    'PartNumber': part_no},
            ExpiresIn=expire)
        logger.debug(f"Create presigned url for upload part '{signed_url}'")
        return signed_url
    def complete(self, parts):
        """
        Complete Multipart Uploading.
        `parts` is a list of dictionaries like below.
        ```
        [ {'ETag': etag, 'PartNumber': 1}, {'ETag': etag, 'PartNumber': 2}, ... ]
        ```
        You can get `ETag` from the upload part response header.
        :param parts: Sent part info.
        :return:
        """
        res = self.s3.complete_multipart_upload(
            Bucket=self.bucket_name,
            Key=self.key,
            MultipartUpload={
                'Parts': parts
            },
            UploadId=self.upload_id
        )
        logger.debug(f"Complete multipart upload '{self.upload_id}'")
        logger.debug(res)
        self.upload_id = None
        self.bucket_name = None
        self.key = None
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('target_file')
    parser.add_argument('--bucket', required=True)
    args = parser.parse_args()

    target_file = Path(args.target_file)
    bucket_name = args.bucket
    key = target_file.name

    # 5 MB is the minimum part size S3 allows (except for the last part)
    max_size = 5 * 1024 * 1024
    file_size = target_file.stat().st_size
    upload_by = int(file_size / max_size) + 1

    session = Session()
    s3util = S3MultipartUploadUtil(session)
    s3util.start(bucket_name, key)

    # Generate one presigned upload_part URL per part
    urls = []
    for part in range(1, upload_by + 1):
        signed_url = s3util.create_presigned_url(part)
        urls.append(signed_url)

    # Upload the file part by part and collect the ETags
    parts = []
    with target_file.open('rb') as fin:
        for num, url in enumerate(urls):
            part = num + 1
            file_data = fin.read(max_size)
            if not file_data:
                # the file size was an exact multiple of max_size; nothing left to send
                break
            print(f"upload part {part} size={len(file_data)}")
            res = requests.put(url, data=file_data)
            print(res)
            if res.status_code != 200:
                return
            etag = res.headers['ETag']
            parts.append({'ETag': etag, 'PartNumber': part})
    print(parts)
    s3util.complete(parts)


if __name__ == '__main__':
    main()
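Assuming the script above is saved as, say, multipart_upload.py (the file name is my own choice) and your AWS credentials are configured for boto3, it can be run like this:

python multipart_upload.py DataSet.csv --bucket mybucket_name

Each "upload part N" print marks one part finishing, which is exactly the hook where a progress bar such as the tqdm sketch shown earlier can be plugged in.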