使用 Python Requests "桥接"文件而不将其加载到内存中？

Question

使用 Python Requests "桥接"文件而不将其加载到内存中？

erg*_*elo 10 python python-2.7 python-requests

我想使用 Python Requests 库从 URL 获取文件，并在 POST 请求中将其作为 multipart 编码的文件上传。问题是该文件可能非常大（50MB-2GB），我不想将其加载到内存中。（上下文见此处。）

按照文档中的示例（multipart、流式下载和流式上传），我写了这样的代码：

    # Attempt from the question: stream the download, then re-post it as a
    # multipart upload.
    # NOTE(review): the traceback quoted right after this snippet shows the
    # `with` statement failing with AttributeError: __exit__ for the asker.
    with requests.get(big_file_url, stream=True) as f:
        # NOTE(review): per the requests docs, `.content` reads the whole
        # response body, so this would not avoid buffering the file in memory.
        requests.post(upload_url, files={'file': ('filename', f.content)})

Run Code Online (Sandbox Code Playgroud)

但我不确定我做得对不对。它实际上会抛出这个错误（摘自回溯信息）：

    with requests.get(big_file_url, stream=True) as f:
    AttributeError: __exit__

Run Code Online (Sandbox Code Playgroud)

有什么建议？

Answer 1

jfs*_*jfs 5

正如其他答案已经指出的那样：requests不支持 POSTing 多部分编码文件而不将其加载到内存中。

要以 multipart/form-data 方式上传大文件而不将其加载到内存中，您可以使用 poster：

#!/usr/bin/env python
import sys
from urllib2 import Request, urlopen

from poster.encode import multipart_encode # $ pip install poster
from poster.streaminghttp import register_openers

register_openers() # install openers globally

def report_progress(param, current, total):
    """Write a single-line progress indicator to stderr.

    Passed as the ``cb`` callback to ``multipart_encode``.

    :param param: not used by this callback.
    :param current: progress value so far (numerator of the percentage).
    :param total: value that corresponds to 100%.
    """
    # Adding .5 before truncation rounds to the nearest whole percent.
    percent_done = int(1e2*current/total + .5)
    # The leading "\r" moves back to the start of the line, so every update
    # overwrites the previous one.
    sys.stderr.write("\r%03d%% of %d" % (percent_done, total))

url = 'http://example.com/path/'
# Open the upload source in a `with` block so the file handle is closed as
# soon as the request has been sent, instead of staying open until the
# interpreter exits.
with open(sys.argv[1], "rb") as upload_file:
    params = {'file': upload_file, 'name': 'upload test'}
    # multipart_encode() returns a (datagen, headers) pair; unpacking it
    # supplies the data and headers arguments of Request.
    response = urlopen(Request(url, *multipart_encode(params, cb=report_progress)))
# Parenthesised form prints the same text under Python 2.
print(response.read())

Run Code Online (Sandbox Code Playgroud)

它可以进行调整以允许 GET 响应对象而不是本地文件：

import posixpath
import sys
from urllib import unquote
from urllib2 import Request, urlopen
from urlparse import urlsplit

from poster.encode import MultipartParam, multipart_encode # pip install poster
from poster.streaminghttp import register_openers

register_openers() # install openers globally

class MultipartParamNoReset(MultipartParam):
    """MultipartParam variant whose reset() is a no-op.

    Overriding reset() allows ``fileobj`` to be an object that has no
    seek() method.
    """

    def reset(self):
        """Do nothing."""
        return None

# Source to download from and endpoint to upload to (placeholder URLs).
get_url = 'http://example.com/bigfile'
post_url = 'http://example.com/path/'

# The open GET response is passed below as the upload's file object.
get_response = urlopen(get_url)
param = MultipartParamNoReset(
    name='file',
    # File name is the last path component of the unquoted download URL.
    filename=posixpath.basename(unquote(urlsplit(get_url).path)), #XXX \ bslash
    filetype=get_response.headers['Content-Type'],
    # The size is taken from the response's Content-Length header; the
    # lookup fails if the server did not send one.
    filesize=int(get_response.headers['Content-Length']),
    fileobj=get_response)

# 'name' is sent as an ordinary form field next to the file part.
params = [('name', 'upload test'), param]
# NOTE: report_progress is not defined in this snippet; it is the callback
# from the earlier example.
datagen, headers = multipart_encode(params, cb=report_progress)
post_response = urlopen(Request(post_url, datagen, headers))
print post_response.read()

Run Code Online (Sandbox Code Playgroud)

此解决方案需要 GET 响应中带有有效的 Content-Length 标头（即文件大小已知）。如果文件大小未知，则可以使用分块传输编码来上传 multipart/form-data 内容。也可以使用 requests 库附带的 urllib3.filepost 来实现类似的方案（例如基于 @AdrienF 的答案），而无需使用 poster。

Answer 2

Adr*_*enF 1

实际上，Kenneth Reitz 的 GitHub 存储库上有一个与此相关的 issue。我遇到了同样的问题（尽管我只是上传本地文件），于是添加了一个包装类：它保存与请求各个部分相对应的流列表，并提供一个 read() 方法依次遍历该列表、读取每个部分，同时计算请求头所需的值（boundary 和 Content-Length）：

# coding=utf-8

from __future__ import unicode_literals
from mimetools import choose_boundary
from requests.packages.urllib3.filepost import iter_fields, get_content_type
from io import BytesIO
import codecs

# Index 3 of the tuple returned by codecs.lookup() is the StreamWriter class.
# NOTE(review): `writer` is not referenced anywhere in the code shown here.
writer = codecs.lookup('utf-8')[3]

class MultipartUploadWrapper(object):
    """Present a multipart/form-data request body as one readable stream.

    The body is kept as a list of streams (part headers, the payload streams
    themselves, separators and the closing delimiter). ``read()`` walks that
    list, so payload streams are consumed piece by piece instead of being
    concatenated up front.

    After construction, ``content_type_header`` and ``content_length_header``
    hold single-entry dicts with the matching request headers.
    """

    def __init__(self, files):
        """
        Initializer

        :param files:
            A dictionary of files to upload, of the form {'file': ('filename', <file object>)}
        :type files:
            Dict
        """
        super(MultipartUploadWrapper, self).__init__()
        self._cursor = 0
        self._body_parts = None
        self.content_type_header = None
        self.content_length_header = None
        self.create_request_parts(files)

    def create_request_parts(self, files):
        """Build the list of body streams and the matching headers.

        A ``(filename, data)`` tuple value becomes a file part; any other
        value becomes a ``text/plain`` field. ``data`` may be an object with
        ``read()`` (it must also support ``seek()``/``tell()``, which are
        used to measure its size) or a plain value.

        :param files: mapping of field name to value, iterated with
            ``iter_fields``.
        """
        request_list = []
        boundary = choose_boundary()
        content_length = 0

        boundary_string = b'--%s\r\n' % (boundary)
        end_string = b'\r\n'
        for fieldname, value in iter_fields(files):
            if isinstance(value, tuple):
                filename, data = value
                part_header = (
                    ('Content-Disposition: form-data; name="%s"; '
                     'filename="%s"\r\n' % (fieldname, filename))
                    + ('Content-Type: %s\r\n\r\n' % (get_content_type(filename))))
            else:
                data = value
                part_header = (
                    ('Content-Disposition: form-data; name="%s"\r\n' % (fieldname))
                    + 'Content-Type: text/plain\r\n\r\n')

            # Encode explicitly and count the encoded bytes: Content-Length
            # is a byte count, and implicit str() coercion fails on
            # non-ASCII field or file names.
            if not isinstance(part_header, bytes):
                part_header = part_header.encode('utf-8')
            part_head = boundary_string + part_header
            request_list.append(BytesIO(part_head))
            content_length += len(part_head)

            if hasattr(data, 'read'):
                data_stream = data
            else:
                # Plain values are sent as their UTF-8 encoded text form.
                if not isinstance(data, (bytes, bytearray)):
                    data = ('%s' % (data,)).encode('utf-8')
                data_stream = BytesIO(data)

            # Measure the payload by seeking to its end, then rewind so that
            # read() starts from the beginning.
            data_stream.seek(0, 2)
            data_size = data_stream.tell()
            data_stream.seek(0)

            request_list.append(data_stream)
            content_length += data_size

            request_list.append(BytesIO(end_string))
            content_length += len(end_string)

        # The closing delimiter carries two extra dashes, so its own length
        # must be counted (it is 2 bytes longer than boundary_string).
        closing_boundary = b'--%s--\r\n' % (boundary)
        request_list.append(BytesIO(closing_boundary))
        content_length += len(closing_boundary)

        # There's a bug in httplib.py that generates a UnicodeDecodeError on binary uploads if
        # there are *any* unicode strings passed into headers as part of the requests call.
        # For this reason all strings are explicitly converted to non-unicode at this point.
        self.content_type_header = {b'Content-Type': b'multipart/form-data; boundary=%s' % boundary}
        self.content_length_header = {b'Content-Length': str(content_length)}
        self._body_parts = request_list

    def read(self, chunk_size=0):
        """Return up to ``chunk_size`` bytes of the request body.

        Reading continues across body parts until ``chunk_size`` bytes were
        collected or the last part is exhausted. With the default of 0
        nothing is read and an empty byte string is returned.

        :param chunk_size: maximum number of bytes to return.
        :return: the bytes read; empty once the whole body was consumed.
        """
        remaining_to_read = chunk_size
        output_array = []
        while remaining_to_read > 0:
            body_part = self._body_parts[self._cursor]
            current_piece = body_part.read(remaining_to_read)
            length_read = len(current_piece)
            output_array.append(current_piece)
            if length_read >= remaining_to_read:
                # The request was satisfied from the current part.
                break
            # This part is exhausted but more bytes are wanted: move on to
            # the next part, unless this was the last one.
            remaining_to_read -= length_read
            if self._cursor == len(self._body_parts) - 1:
                break
            self._cursor += 1
        return b''.join(output_array)

Run Code Online (Sandbox Code Playgroud)

因此，您可以将此对象作为“data”属性传递给 Request.request 对象，而不是传递“files”关键字arg

编辑

我已经清理了代码

归档时间：	12 年，9 月前
查看次数：	3607 次
最近记录：	7 年，5 月前