我想在我的 Hadoop Streaming 作业中从一个文件里读取一个列表。这是我简单的 mapper.py:
#!/usr/bin/env python
import sys
import json
def read_file(path='../user_ids'):
    """Read one ID per line from a text file.

    Args:
        path: File to read. Defaults to '../user_ids', the location that
            was previously hard-coded, so existing no-argument calls
            behave as before.

    Returns:
        A list with one entry per line, in file order, each stripped of
        leading and trailing whitespace. Blank lines yield empty strings.

    Raises:
        IOError/OSError: If the file cannot be opened or read.
    """
    # NOTE(review): the default path is relative, so it resolves against the
    # process working directory -- confirm the file is actually present at
    # that location on the task nodes when run under Hadoop Streaming.
    # The context manager closes the handle even if reading raises.
    with open(path, 'r') as f:
        return [line.strip() for line in f]
if __name__ == '__main__':
    # Build the lookup set once; the file yields strings, so membership
    # tests below compare against str(user_id).
    id_list = set(read_file())
    # Each STDIN line is parsed as one JSON document.
    for raw in sys.stdin:
        record = json.loads(raw.strip())
        user_id = record['user']['id']
        if str(user_id) not in id_list:
            continue
        # Emit "<user_id>\t<record>". The record is formatted with %s, i.e.
        # the str() of the parsed object rather than the original JSON text.
        print('%s\t%s' % (user_id, record))
Run Code Online (Sandbox Code Playgroud)
这是我的 reducer.py:
#!/usr/bin/env python
from operator import itemgetter
import …Run Code Online (Sandbox Code Playgroud)