Node.js和Amazon S3:如何遍历存储桶中的所有文件?

nab*_*nab 44 amazon-s3 node.js

Node.js是否有任何Amazon S3客户端库允许列出S3存储桶中的所有文件?

最知名的aws2js和knox似乎都没有这个功能.

Mee*_*ohi 62

使用官方aws-sdk:

var allKeys = [];

/**
 * Collects every object in `s3bucket` into `allKeys`, following pagination
 * until the listing is no longer truncated.
 *
 * @param {string|undefined} marker - Key to start listing after; omit for the first page.
 * @param {function(Error=)} cb - Called exactly once: with an error if a request
 *   fails, or with no arguments after the last page has been collected.
 */
function listAllKeys(marker, cb)
{
  s3.listObjects({Bucket: s3bucket, Marker: marker}, function(err, data){
    // On failure `data` is not usable; report instead of crashing on data.Contents.
    if(err)
      return cb(err);

    // Append the objects themselves; push(data.Contents) would nest one array per page.
    Array.prototype.push.apply(allKeys, data.Contents);

    if(!data.IsTruncated)
      return cb();

    // NextMarker is only returned when a Delimiter was requested, so fall back
    // to the last key of this page as the marker for the next request.
    var next = data.NextMarker;
    if(!next && data.Contents.length)
      next = data.Contents[data.Contents.length - 1].Key;

    // Without a marker the next request would restart from the beginning forever.
    if(!next)
      return cb(new Error('Truncated listing returned no marker to continue from'));

    listAllKeys(next, cb);
  });
}
Run Code Online (Sandbox Code Playgroud)

请参阅s3.listObjects

2017年编辑:基本思路相同,但现在推荐使用listObjectsV2( ... ),并通过ContinuationToken进行分页(参见s3.listObjectsV2):

var allKeys = [];

/**
 * Collects every object in `s3bucket` into `allKeys` using listObjectsV2,
 * following continuation tokens until the listing is no longer truncated.
 *
 * @param {string|null|undefined} token - Continuation token from the previous
 *   page; pass a falsy value for the first page.
 * @param {function(Error=)} cb - Called exactly once: with an error if a request
 *   fails, or with no arguments after the last page has been collected.
 */
function listAllKeys(token, cb)
{
  var opts = { Bucket: s3bucket };
  // Only send ContinuationToken when there is one to send.
  if(token) opts.ContinuationToken = token;

  s3.listObjectsV2(opts, function(err, data){
    // On failure `data` is not usable; report instead of crashing on data.Contents.
    if(err)
      return cb(err);

    allKeys = allKeys.concat(data.Contents);

    if(data.IsTruncated)
      listAllKeys(data.NextContinuationToken, cb);
    else
      cb();
  });
}
Run Code Online (Sandbox Code Playgroud)

  • @kuanb确实如此.根据文档,由于上面的参数中缺少Delimiter,data.NextMarker将为null.请参阅下面的答案 (3认同)

Ken*_*Lin 15

这是我编写的用于从截断列表中组装S3对象的Node代码.

// Request parameters handed to s3.listObjects by the code below.
// <yourbucket> and <yourprefix> are template placeholders, not valid JavaScript:
// replace each one with a string literal before running.
var params = {
    Bucket: <yourbucket>,
    Prefix: <yourprefix>,
};

var s3DataContents = [];    // Single array of all combined S3 data.Contents

/**
 * Prints the collected listing held in `s3DataContents`.
 *
 * With `program.al` set, the whole array is dumped as indented JSON.
 * Otherwise one key is printed per line, prefixed with its 1-based position
 * unless `program.b` is set.
 */
function s3Print() {
    // --al: dump every object in full and stop.
    if (program.al) {
        console.log(JSON.stringify(s3DataContents, null, "    "));
        return;
    }

    // --b: bare keys only; without it each key gets an "N: " prefix.
    s3DataContents.forEach(function (entry, position) {
        var prefix = program.b ? '' : (position + 1) + ': ';
        console.log(prefix + entry.Key);
    });
}

/**
 * Lists objects with s3.listObjects, appending every page's Contents to
 * `s3DataContents` and following truncated responses until the listing ends.
 *
 * The caller's `params` object is never modified: each follow-up request uses
 * a copy carrying the new Marker, so the same `params` can be reused safely.
 *
 * @param {Object} params - listObjects request parameters.
 * @param {function()} cb - Called with no arguments once the last page has been
 *   appended. It is not called when a request fails; the error is logged instead.
 */
function s3ListObjects(params, cb) {
    s3.listObjects(params, function(err, data) {
        if (err) {
            console.log("listS3Objects Error:", err);
            return;
        }

        var contents = data.Contents;
        s3DataContents = s3DataContents.concat(contents);

        // Finish when the listing is complete, or when a truncated page is empty
        // and therefore offers no key to continue from.
        if (!data.IsTruncated || contents.length === 0) {
            cb();
            return;
        }

        // Continue after the last returned key, on a copy of params so the
        // caller's object keeps no stale Marker.
        var nextParams = Object.assign({}, params, {
            Marker: contents[contents.length-1].Key
        });
        s3ListObjects(nextParams, cb);
    });
}

// Start the listing; s3Print is invoked once the final (non-truncated) page has been appended.
s3ListObjects(params, s3Print);
Run Code Online (Sandbox Code Playgroud)

请注意listObjects文档中关于NextMarker的说明:它并不总是存在于返回的数据对象中,因此我在上面的代码中根本不使用它...

NextMarker - (字符串)当响应被截断时(响应中的IsTruncated元素值为true),您可以使用此字段中的键名作为后续请求中的标记来获取下一组对象.Amazon S3按字母顺序列出对象.注意:仅当您指定了分隔符(delimiter)请求参数时,才会返回此元素.如果响应不包含NextMarker并且它被截断,则可以使用响应中最后一个Key的值作为后续请求中的标记来获取下一组对象键.

整个程序现已推送到https://github.com/kenklin/s3list.


nki*_*tku 12

使用异步生成器

导入 S3

const { S3 } = require("aws-sdk");
const s3 = new S3();
Run Code Online (Sandbox Code Playgroud)

创建一个生成器函数来检索所有文件列表

/**
 * Async generator that yields one listObjectsV2 response per page.
 *
 * The caller's `opts` is shallow-copied, so the continuation token is tracked
 * on a private request object. Iteration ends after the first response that
 * carries no NextContinuationToken.
 *
 * @param {Object} opts - listObjectsV2 request parameters.
 * @yields {Object} The response object of each page request.
 */
async function* listAllKeys(opts) {
  const request = { ...opts };
  for (;;) {
    const page = await s3.listObjectsV2(request).promise();
    // Record the token before yielding so the next request resumes after this page.
    request.ContinuationToken = page.NextContinuationToken;
    yield page;
    if (!request.ContinuationToken) {
      return;
    }
  }
}
Run Code Online (Sandbox Code Playgroud)

准备 aws 参数,基于api 文档

// Request parameters passed to listObjectsV2 through listAllKeys.
// Only Bucket is set; the commented-out entries list the other parameters
// that can be enabled as needed.
const opts = {
  Bucket: "bucket-xyz" /* required */,
  // ContinuationToken: 'STRING_VALUE',
  // Delimiter: 'STRING_VALUE',
  // EncodingType: url,
  // FetchOwner: true || false,
  // MaxKeys: 'NUMBER_VALUE',
  // Prefix: 'STRING_VALUE',
  // RequestPayer: requester,
  // StartAfter: 'STRING_VALUE'
};
Run Code Online (Sandbox Code Playgroud)

使用生成器

/**
 * Walks every page produced by listAllKeys(opts) and logs each page's
 * Contents as it arrives.
 */
async function main() {
  const pages = listAllKeys(opts);
  // for await pulls the next page only after the previous one has been logged.
  for await (const { Contents } of pages) {
    console.log(Contents);
  }
}
main();
Run Code Online (Sandbox Code Playgroud)

就是这样

或延迟加载

/**
 * Demonstrates pulling pages on demand: calls next() on the generator three
 * times and logs each iterator result.
 */
async function main() {
  const pages = listAllKeys(opts);
  // Each result is logged as {value, done}; once the generator is exhausted
  // the result is {value: undefined, done: true}.
  for (let pull = 0; pull < 3; pull += 1) {
    const result = await pages.next();
    console.log(result);
  }
}
main();
Run Code Online (Sandbox Code Playgroud)

或使用生成器制作 Observable 函数

/**
 * Builds a subscribe function that pushes every page from listAllKeys(opts)
 * to an observer.
 *
 * @param {Object} opts - Request parameters forwarded to listAllKeys.
 * @returns {function(Object): function()} A function taking an observer with
 *   `next`, `error` and `complete` methods. It starts paging immediately and
 *   returns a stop function; after stop is called, paging ends once the page
 *   currently in flight has been delivered, then `complete` is called.
 */
const lister = (opts) => (o$) => {
  let needMore = true;
  // Named `pump` rather than `process` to avoid shadowing Node's global.
  const pump = async () => {
    try {
      for await (const data of listAllKeys(opts)) {
        o$.next(data);
        if (!needMore) break;
      }
    } catch (err) {
      // Forward failures to the observer instead of leaving a rejected,
      // unhandled promise behind.
      o$.error(err);
      return;
    }
    o$.complete();
  };
  pump();
  return () => (needMore = false);
};
Run Code Online (Sandbox Code Playgroud)

在 RXJS 中使用这个 observable 函数

// Using Rxjs

const { Observable } = require("rxjs");
// mergeMap is the current name of the deprecated flatMap alias.
const { mergeMap } = require("rxjs/operators");

/**
 * Subscribes to the bucket listing and logs every object individually.
 *
 * @returns {Object} The subscription returned by subscribe().
 */
function listAll() {
  // The Observable constructor replaces the deprecated Observable.create.
  return new Observable(lister(opts))
    // Flatten each page's Contents array into a stream of single objects.
    .pipe(mergeMap((v) => v.Contents))
    .subscribe(console.log);
}

listAll();
Run Code Online (Sandbox Code Playgroud)

或将此可观察函数与 Nodejs EventEmitter 一起使用

const EventEmitter = require("events");

// Emitter that re-publishes the lister callbacks as "next", "error" and "complete" events.
const _eve = new EventEmitter();

/** Listener for "next": receives each set of data and logs it. */
async function onData(data) {
  console.log(data);
}

/** Listener for "error": receives the error and logs it. */
async function onError(error) {
  console.log(error);
}

/** Listener for "complete": runs when the data has been completely received. */
async function onComplete() {}

// Register all listeners before the listing is started below.
_eve
  .on("next", onData)
  .on("error", onError)
  .on("complete", onComplete);

// Start listing; each observer callback forwards its argument as an event.
// `stop` is the function returned by the lister subscription.
const stop = lister(opts)({
  next(v) {
    return _eve.emit("next", v);
  },
  error(e) {
    return _eve.emit("error", e);
  },
  complete(v) {
    return _eve.emit("complete", v);
  },
});
Run Code Online (Sandbox Code Playgroud)

使用 Typescript 和 AWS-SDK v3 + Deno

import {
  paginateListObjectsV2,
  S3Client,
  S3ClientConfig,
} from "@aws-sdk/client-s3";

/* // For Deno
import {
  paginateListObjectsV2,
  S3Client,
  S3ClientConfig,
} from "https://deno.land/x/aws_sdk@v3.14.0.0/client-s3/mod.ts"; */

// Client configuration: explicit credentials plus the region.
// The credential values here are literal placeholder strings.
// NOTE(review): presumably real credentials should be supplied from outside
// the source file rather than written here — confirm for your deployment.
const s3Config: S3ClientConfig = {
  credentials: {
    accessKeyId: "accessKeyId",
    secretAccessKey: "secretAccessKey",
  },
  region: "us-east-1",
};

// S3 client built from the configuration above.
const client = new S3Client(s3Config);
// Input for the list request; only the bucket name is specified.
const s3Opts = { Bucket: "bucket-xyz" };

/**
 * Collects the Contents entries of every page produced by
 * paginateListObjectsV2 for `s3Opts` into a single array.
 *
 * @returns A promise resolving to the combined array of entries.
 */
async function getAllS3Files() {
  const pages = paginateListObjectsV2({ client }, s3Opts);
  const totalFiles = [];
  for await (const page of pages) {
    // A page without Contents contributes nothing.
    for (const file of page.Contents ?? []) {
      totalFiles.push(file);
    }
  }
  return totalFiles;
}

Run Code Online (Sandbox Code Playgroud)


nab*_*nab 8

实际上,aws2js通过s3.get()方法调用支持在低级别列出存储桶中的对象.要做到这一点,必须传递Amazon S3 REST API页面上记录的prefix参数:

var s3 = require('aws2js').load('s3', awsAccessKeyId, awsSecretAccessKey);
s3.setBucket(bucketName);

// The prefix is a query-string value, so it must be escaped with
// encodeURIComponent: encodeURI leaves characters such as '&', '=', '+' and '#'
// untouched, which would corrupt the query for prefixes containing them.
var folder = encodeURIComponent('some/path/to/S3/folder');
var url = '?prefix=' + folder;

// Request the bucket listing filtered by the prefix; 'xml' is passed as the
// response handler argument. Both callback arguments are logged as received.
s3.get(url, 'xml', function (error, data) {
    console.log(error);
    console.log(data);
});
Run Code Online (Sandbox Code Playgroud)

上面代码段中的data变量包含bucketName存储桶中所有对象的列表.

  • 虽然这被设置为正确/选择的答案,但应该注意https://github.com/SaltwaterC/aws2js已被弃用.在npm安装时,它会通知一个"aws2js已被弃用.请使用aws-sdk." (2认同)

hur*_*lad 5

当我找不到一个好的现有解决方案时,我发布了knox-copy.它将REST API的所有分页细节封装在大家熟悉的Node流中:

var knoxCopy = require('knox-copy');

// Client for one bucket; the key and secret values are placeholders to fill in.
var client = knoxCopy.createClient({
  key: '<api-key-here>',
  secret: '<secret-here>',
  bucket: 'mrbucket'
});

// Log every key delivered through the stream's 'data' event.
// NOTE(review): no 'error' listener is attached here — confirm whether stream
// errors need to be handled.
client.streamKeys({
  // omit the prefix to list the whole bucket
  prefix: 'buckets/of/fun' 
}).on('data', function(key) {
  console.log(key);
});
Run Code Online (Sandbox Code Playgroud)

如果您列出的文件少于1000个,则单个页面将起作用:

// Fetch a single page of keys under the prefix and log its Contents.
client.listPageOfKeys({
  prefix: 'smaller/bucket/o/fun'
}, function(err, page) {
  // Report a failed request instead of dereferencing `page`, which would
  // replace the real error with a TypeError.
  if (err) {
    console.error(err);
    return;
  }
  console.log(page.Contents); // <- Here's your list of files
});
Run Code Online (Sandbox Code Playgroud)