我试图通过修改我的脚本在 R 中进行并行编程。在我的脚本上,我做了两个并行编程。第一个已经完成,但第二个是错误的,而脚本结构是相同的。下面是我的代码:
library(rvest)
library(RMySQL)
library(curl)
library(gdata)
library(doMC)
library(foreach)
library(doParallel)
library(raster)
trim <- function (x) gsub("^\\s+|\\s+$", "", x)
setwd('/home/chandra/R/IlmuOne/MisterAladin')
no_cores <- detectCores()
cl<-makeCluster(no_cores)
registerDoParallel(cl)
MasterData = read.xls("Master Hotels - FINAL.xlsx", sheet = 1, header = TRUE)
MasterData$url_agoda = as.character(MasterData$url_agoda)
today = as.Date(format(Sys.time(), "%Y-%m-%d"))+2
ntasks <- nrow(MasterData)
#This section perfomed well
foreach(i=1:ntasks) %dopar% {
url = MasterData$url_agoda[i]
if (trim(url)!='-' & trim(url)!='')
{
from = gregexpr(pattern ='=',url)[[1]][1]
piece1 = substr(url,1,from)
from = gregexpr(pattern ='&los=',url)[[1]][1]
piece2 = substr(url,from,nchar(url))
MasterData$url_agoda[i] = paste0(piece1,today,piece2)
}
}
con <- dbConnect(RMySQL::MySQL(), username = "root", password = "master",host = "localhost", dbname = "mister_aladin")
#Tried first 10 data
#Below section was error and always return error: Error in { : task 1 failed - "could not find function "%>%""
foreach(a=1:10, .packages='foreach') %dopar% {
hotel_id = MasterData$id[a]
vendor = 'Agoda'
url = MasterData$url_agoda[a]
if (url!='-')
{
tryCatch({
hotel <- curl(url) %>%
read_html() %>%
html_nodes(xpath='//*[@id="room-grouping"]') %>%
html_table(fill = TRUE)
hotel <- hotel[[1]]
hotel$hotel_id= hotel_id
hotel$vendor= vendor
colnames(hotel)[1] = 'TheSpace'
colnames(hotel)[4] = 'PricePerNight'
room = '-'
hotel$NormalPrice = 0
hotel$FinalPrice = 0
for(i in 1:nrow(hotel))
{
if (i==1 | (!grepl('See photos',hotel$TheSpace[i]) & hotel$TheSpace[i]!='') )
{
room = hotel$TheSpace[i]
}
hotel$TheSpace[i] = room
#Normal Price
if (gregexpr(pattern ='IDR',hotel$PricePerNight[i])[[1]][1][1]==1)
{
split = strsplit(hotel$PricePerNight[i],'\n')[[1]]
NormalPrice = trim(split[2])
hotel$NormalPrice[i] = NormalPrice
NormalPrice = as.integer(gsub(",","",NormalPrice))
hotel$NormalPrice[i] = NormalPrice
}
#Final Price
if (gregexpr(pattern ='IDR',hotel$PricePerNight[i])[[1]][1][1]==1)
{
split = strsplit(hotel$PricePerNight[i],'\n')[[1]]
FinalPrice = trim(split[6])
hotel$FinalPrice[i] = FinalPrice
FinalPrice = as.integer(gsub(",","",FinalPrice))
hotel$FinalPrice[i] = FinalPrice
}
hotel$NormalPrice[is.na(hotel$NormalPrice)] <- 0
hotel$FinalPrice[is.na(hotel$FinalPrice)] <- 0
}
hotel = hotel[which(hotel$FinalPrice!=0),c("TheSpace","NormalPrice","FinalPrice")]
colnames(hotel) = c('room','normal_price','final_price')
hotel$log = format(Sys.time(), "%Y-%m-%d %H:%M:%S")
hotel$hotel_id = hotel_id
hotel$vendor = vendor
Push = hotel[,c('hotel_id','room','normal_price','final_price','vendor','log')]
#print(paste0('Agoda: push one record, hotel id ',hotel_id,'!'))
#cat(paste(paste0('Agoda: push one record, hotel id ',hotel_id,'!'),'\n'))
dbWriteTable(conn=con,name='prices_',value=as.data.frame(Push), append = TRUE, row.names = F)
},
error = function(e) {
Sys.sleep(2)
e
})
}
}
dbDisconnect(con)
stopImplicitCluster()
Run Code Online (Sandbox Code Playgroud)
每次我运行脚本时,它总是给我错误:{中的错误:任务 1 失败-“找不到函数“%>%””
我已经检查了这个论坛上的每一个帖子,并试图应用它,但没有一个工作。
请建议任何解决方案
您必须使用.packages = c("magrittr", ...)并包含所有包,这些包是在foreach循环中运行代码所必需的。然而,.packages = "foreach"并没有帮助。
看,您可以想象您定义的所有包.packages都在每个并行工作程序中转发/加载。
运营商%>%需要包裹magrittr。然而,在这种情况下,在脚本开头加载它是不够的 - 需要为每个节点加载它。您可以将此行添加到集群的创建中来完成此操作:
cl<-makeCluster(no_cores)
registerDoParallel(cl)
clusterCall(cl, function() library(magrittr))
Run Code Online (Sandbox Code Playgroud)
| 归档时间: |
|
| 查看次数: |
5695 次 |
| 最近记录: |