TFX*_*TFX 19 mapreduce mongodb
我一直在尝试在MongoDB中使用MapReduce来做我认为简单的过程.我不知道这是否是正确的方法,如果我甚至应该使用MapReduce.我用谷歌搜索了我想到的关键词并试图点击我认为我会取得最大成功的文档 - 但没有.也许我在想这个太难了?
我有两个系列:details和gpas
details由大量文档(300多万条)组成.同一个studentid最多可以出现两次,每个year各一条,如下所示:
{ "_id" : ObjectId("4d49b7yah5b6d8372v640100"), "classes" : [1,17,19,21], "studentid" : "12345a", "year" : 1}
{ "_id" : ObjectId("4d76b7oij7s2d8372v640100"), "classes" : [2,12,19,22], "studentid" : "98765a", "year" : 1}
{ "_id" : ObjectId("4d49b7oij7s2d8372v640100"), "classes" : [32,91,101,217], "studentid" : "12345a", "year" : 2}
{ "_id" : ObjectId("4d76b7rty7s2d8372v640100"), "classes" : [1,11,18,22], "studentid" : "24680a", "year" : 1}
{ "_id" : ObjectId("4d49b7oij7s2d8856v640100"), "classes" : [32,99,110,215], "studentid" : "98765a", "year" : 2}
...
Run Code Online (Sandbox Code Playgroud)
gpas中的studentid与details中的studentid相对应.每个studentid只有一条记录,如下所示:
{ "_id" : ObjectId("4d49b7yah5b6d8372v640111"), "studentid" : "12345a", "overall" : 97, "subscore": 1}
{ "_id" : ObjectId("4f76b7oij7s2d8372v640213"), "studentid" : "98765a", "overall" : 85, "subscore": 5}
{ "_id" : ObjectId("4j49b7oij7s2d8372v640871"), "studentid" : "24680a", "overall" : 76, "subscore": 2}
...
Run Code Online (Sandbox Code Playgroud)
最后,我希望得到一个每个学生只占一个文档的集合,格式如下:
{ "_id" : ObjectId("4d49b7yah5b6d8372v640111"), "studentid" : "12345a", "classes_1": [1,17,19,21], "classes_2": [32,91,101,217], "overall" : 97, "subscore": 1}
{ "_id" : ObjectId("4f76b7oij7s2d8372v640213"), "studentid" : "98765a", "classes_1": [2,12,19,22], "classes_2": [32,99,110,215], "overall" : 85, "subscore": 5}
{ "_id" : ObjectId("4j49b7oij7s2d8372v640871"), "studentid" : "24680a", "classes_1": [1,11,18,22], "classes_2": [], "overall" : 76, "subscore": 2}
...
Run Code Online (Sandbox Code Playgroud)
我打算这样做的方法是运行MapReduce,如下所示:
// Map step for the `details` collection: emits one value per document, keyed
// by studentid. The GPA fields are zero placeholders so that the value carries
// the same keys as the one emitted by mapGpas.
var mapDetails = function () {
    var detailValue = {
        studentid: this.studentid,
        classes: this.classes,
        year: this.year,
        overall: 0,
        subscore: 0
    };
    emit(this.studentid, detailValue);
};
// Map step for the `gpas` collection: emits one value per document, keyed by
// studentid. `year: 0` marks the value as a GPA record (the reducer tests for
// it) and `classes` is an empty placeholder.
var mapGpas = function () {
    var gpaValue = {
        studentid: this.studentid,
        classes: [],
        year: 0,
        overall: this.overall,
        subscore: this.subscore
    };
    emit(this.studentid, gpaValue);
};
// Reduce step: folds every value emitted for one studentid into one record.
//   key    - the studentid the values were emitted under (not read here)
//   values - array of emitted values
// Returns {studentid, classes_1, classes_2, overall, subscore}.
// NOTE: the returned object has no `classes` or `year` field, so its shape
// differs from what the map functions emit. If a previously reduced record is
// passed back in as one of `values`, its class lists are not carried over.
var reduce = function (key, values) {
    var merged = { studentid: "0", classes_1: [], classes_2: [], overall: 0, subscore: 0 };

    values.forEach(function (entry) {
        // A year of 0 marks a GPA record: copy the scores and stop there.
        if (entry.year == 0) {
            merged.overall = entry.overall;
            merged.subscore = entry.subscore;
            return;
        }

        // Anything else is treated as a details record.
        if (entry.year == 1) {
            merged.classes_1 = entry.classes;
        }
        if (entry.year == 2) {
            merged.classes_2 = entry.classes;
        }
        merged.studentid = entry.studentid;
    });

    return merged;
};
// Run the reducer over both collections. Each run writes to the `joined`
// collection using the `reduce` output action.
res = db.details.mapReduce(
    mapDetails,
    reduce,
    {out: {reduce: 'joined'}}
);
res = db.gpas.mapReduce(
    mapGpas,
    reduce,
    {out: {reduce: 'joined'}}
);
Run Code Online (Sandbox Code Playgroud)
但是当我运行它时,这是我得到的集合:
{ "_id" : "12345a", "value" : { "studentid" : "12345a", "classes_1" : [ ], "classes_2" : [ ], "overall" : 97, "subscore" : 1 } }
{ "_id" : "98765a", "value" : { "studentid" : "98765a", "classes_1" : [ ], "classes_2" : [ ], "overall" : 85, "subscore" : 5 } }
{ "_id" : "24680a", "value" : { "studentid" : "24680a", "classes_1" : [ ], "classes_2" : [ ], "overall" : 76, "subscore" : 2 } }
Run Code Online (Sandbox Code Playgroud)
结果中缺少了classes数组的内容.
另外,如何访问MapReduce生成的value元素里面的字段?MapReduce是否总是把结果输出到value中,还是可以把它改成别的名字?
Mar*_*arc 43
这类似于MongoDB用户Google网上论坛上提出的问题.
https://groups.google.com/group/mongodb-user/browse_thread/thread/60a8b683e2626ada?pli=1
答案引用了一个与您的示例类似的在线教程: http://tebros.com/2011/07/using-mongodb-mapreduce-to-join-2-collections/
有关MongoDB中MapReduce的更多信息,请参阅文档: http://www.mongodb.org/display/DOCS/MapReduce
此外,MongoDB Cookbook文章"使用版本化文档查找最大值和最小值"的"Extras"部分提供了一个有用的逐步演练,说明MapReduce操作是如何工作的: http://cookbook.mongodb.org/patterns/finding_max_and_min/
如果您已经阅读了一些参考文档,请原谅我.我已经将它们包含在其他用户的利益中,这些用户可能正在阅读这篇文章以及在MongoDB中使用MapReduce的新用户
重要的是,Map函数中'emit'语句输出的值必须与Reduce函数的输出格式一致.如果Map函数对某个键只输出一个文档,Reduce函数可能根本不会运行,此时输出集合中就会包含格式不一致的文档.
我稍微修改了你的map语句,以你想要的输出格式发出文档,有两个独立的"类"数组.
我还重写了你的reduce语句,将新类添加到classes_1和classes_2数组中,只有它们尚不存在时才会这样.
// Map step for the `details` collection: emits one value per document, keyed
// by studentid, already in the shape the reducer returns. The document's
// class list goes into classes_1 when year is 1 and into classes_2 when year
// is 2; the other list stays empty. GPA fields are zero placeholders.
var mapDetails = function () {
    emit(this.studentid, {
        studentid: this.studentid,
        classes_1: this.year == 1 ? this.classes : [],
        classes_2: this.year == 2 ? this.classes : [],
        year: this.year,
        overall: 0,
        subscore: 0
    });
};
// Map step for the `gpas` collection: emits one value per document, keyed by
// studentid. Both class lists are empty placeholders and `year: 0` marks the
// value as a GPA record for the reducer.
var mapGpas = function () {
    var gpaValue = {
        studentid: this.studentid,
        classes_1: [],
        classes_2: [],
        year: 0,
        overall: this.overall,
        subscore: this.subscore
    };
    emit(this.studentid, gpaValue);
};
/**
 * Reduce step shared by both mapReduce runs: merges every value recorded for
 * one studentid into a single record.
 *
 * @param {string} key - The studentid the values were emitted under (not read here).
 * @param {Array<Object>} values - Values with `studentid`, `classes_1`,
 *     `classes_2`, `overall`, `subscore` and, for freshly emitted values, `year`.
 * @returns {Object} `{studentid, classes_1, classes_2, overall, subscore}`, where
 *     each class list is the de-duplicated union of the incoming lists.
 */
var r = function (key, values) {
    var outs = { studentid: "0", classes_1: [], classes_2: [], overall: 0, subscore: 0 };

    // Appends each item to `target` unless it is already present.
    // Kept inside the reducer so the function references nothing outside itself.
    // The parameter was previously named `class`, which is a reserved word and
    // fails to parse on current JavaScript engines.
    var addMissing = function (target, items) {
        items.forEach(function (item) {
            if (target.indexOf(item) === -1) {
                target.push(item);
            }
        });
    };

    values.forEach(function (v) {
        outs.studentid = v.studentid;
        addMissing(outs.classes_1, v.classes_1);
        addMissing(outs.classes_2, v.classes_2);
        // year 0 marks a GPA record; only those carry real scores.
        // Loose equality is kept on purpose to preserve the original matching.
        if (v.year == 0) {
            outs.overall = v.overall;
            outs.subscore = v.subscore;
        }
    });

    return outs;
};
// Run the reducer `r` over both collections. Each run writes to the `joined`
// collection using the `reduce` output action.
res = db.details.mapReduce(
    mapDetails,
    r,
    {out: {reduce: 'joined'}}
);
res = db.gpas.mapReduce(
    mapGpas,
    r,
    {out: {reduce: 'joined'}}
);
Run Code Online (Sandbox Code Playgroud)
运行两个MapReduce操作会生成以下集合,该集合与您所需的格式匹配:
> db.joined.find()
{ "_id" : "12345a", "value" : { "studentid" : "12345a", "classes_1" : [ 1, 17, 19, 21 ], "classes_2" : [ 32, 91, 101, 217 ], "overall" : 97, "subscore" : 1 } }
{ "_id" : "24680a", "value" : { "studentid" : "24680a", "classes_1" : [ 1, 11, 18, 22 ], "classes_2" : [ ], "overall" : 76, "subscore" : 2 } }
{ "_id" : "98765a", "value" : { "studentid" : "98765a", "classes_1" : [ 2, 12, 19, 22 ], "classes_2" : [ 32, 99, 110, 215 ], "overall" : 85, "subscore" : 5 } }
>
Run Code Online (Sandbox Code Playgroud)
MapReduce始终以{_id: "id", value: "value"}的形式输出文档.在标题为"Dot Notation(Reaching into Objects)"的文档中,有更多关于使用子文档的信息: http://www.mongodb.org/display/DOCS/Dot+Notation+%28Reaching+into+Objects%29
如果您希望MapReduce的输出以不同的格式显示,则必须在应用程序中以编程方式执行此操作.
希望这将提高您对MapReduce的理解,并使您更接近于生成所需的输出集合.祝好运!
| 归档时间: |
|
| 查看次数: |
47010 次 |
| 最近记录: |