我想通过 R(例如 RMySQL 或 DBI)提取我的 50 个 MySQL 数据库中所有表的 Create Statements。每个数据库都有一些表,例如:`SHOW CREATE TABLE db.table`、`SHOW CREATE TABLE db1.mytable`、`SHOW CREATE TABLE db2.sometable`、`SHOW CREATE TABLE db3.mytable1`;即 db1(table, mytable, ...)、db2(table1, sometable),依此类推。
通过示例查询来说明DB:
-- Example query that joins tables living in four different databases
-- (db, db1, db2, db3) by fully qualifying every table name.
SELECT *
FROM db.table1 m
LEFT JOIN db1.sometable o ON m.id = o.id
-- NOTE(review): alias `p` is not defined anywhere in this query; presumably
-- `m` or `o` was meant. Confirm before running.
LEFT JOIN db2.sometables t ON p.id=t.id
-- This condition only filters `s`; it does not relate `s` to another table.
LEFT JOIN db3.sometable s ON s.column='john'
library(RMySQL)
library(DBI)
con <- dbConnect(RMySQL::MySQL(),
username = "",
password = "",
host = "",
port = 3306,
dbname= mydbname)# when using dbs<-dbGetQuery(con ,"SHOW DATABASES") I have to …Run Code Online (Sandbox Code Playgroud) 我正在尝试通过JDBC连接到Amazon Athena并pool:
到目前为止有效:
library(RJDBC)
library(DBI)
library(pool)
library(dplyr)
library(dbplyr)
# Register the Athena JDBC driver class, loading it from the local jar file.
drv <- RJDBC::JDBC('com.amazonaws.athena.jdbc.AthenaDriver', '/opt/jdbc/AthenaJDBC41-1.1.0.jar')
# Create a connection pool for that driver. The arguments after `drv` are
# presumably forwarded to the underlying DBI connect call; confirm against
# the pool documentation.
pool_instance <- dbPool(
drv = drv,
url = "jdbc:awsathena://athena.us-west-2.amazonaws.com:443/",
user = "me",
s3_staging_dir = "s3://somedir",
# NOTE(review): the credentials are written inline as literals; these look
# like placeholders, but real secrets should not be kept in the script.
password = "pwd"
)
# Send a raw SQL string through the pool and keep the result in `mydata`.
mydata <- DBI::dbGetQuery(pool_instance, "SELECT *
FROM myDB.myTable
LIMIT 10")
# Print the fetched rows.
mydata
Run Code Online (Sandbox Code Playgroud)
--->工作正常.正确的数据正在返回.
这不起作用:
pool_instance %>% tbl("myDB.myTable") %>% head(10)
# Error in .verify.JDBC.result(r, "Unable to retrieve JDBC result set for ", :
# Unable to retrieve JDBC result set for SELECT *
# FROM "myDB.myTable" AS "zzz2"
# WHERE …Run Code Online (Sandbox Code Playgroud) 我想对面板数据进行分区并保留数据的面板性质:
library(caret)
library(mlbench)
# Example panel data: `id` identifies a person who is observed over several years.
data <- read.table("http://people.stern.nyu.edu/wgreene/Econometrics/healthcare.csv",
header=TRUE, sep=",", na.strings="NA", dec=".", strip.white=TRUE)
# Partition on the dependent variable WORKING with p = .75. The partition is
# made over rows (observations) of `data`, not over persons.
inTrain <- createDataPartition(y = data$WORKING, p = .75,list = FALSE)
# Rows selected by the partition form the training set.
training <- data[ inTrain,]
# All remaining rows form the testing set.
testing <- data[-inTrain,]
# Inspect the ids on both sides: the same id can show up in training and in
# testing, because rows rather than persons were partitioned.
str(training$id[10:20])
str(testing$id)
Run Code Online (Sandbox Code Playgroud)
但是我想在对数据进行分区或采样时,避免将同一个人(id)分到两个数据集中。有没有一种方法可以从数据中随机采样/分区,并按个体(而不是按观测)分配到相应的分区?
我试图采样:
# NOTE(review): sample() returns 1000 id *values*, but they are used here as
# row positions of `data` (presumably the ids are numeric; confirm). This
# therefore picks single rows instead of all rows of the sampled persons.
mysample <- data[sample(unique(data$id), 1000,replace=FALSE),]
Run Code Online (Sandbox Code Playgroud)
然而,这破坏了数据的面板性质......
我最近刚从 Stata 转到 R,在实现与 Stata 命令 `xtlogit, fe`、`xtlogit, re` 和 `predict` 等价的 R 代码时遇到了一些麻烦。我想请求一些帮助来调整以下场景:
# Pooled logit on panel data with a train/test split made per individual.
# Load the example panel data set; `id` identifies the individual.
data <- read.table(
  "http://people.stern.nyu.edu/wgreene/Econometrics/healthcare.csv",
  header = TRUE, sep = ",", na.strings = "NA", dec = ".", strip.white = TRUE
)
# library() fails loudly if caret is missing; require() would only return FALSE.
library(caret) # for confusionMatrix

#### Split into test & train according to the panel nature
#### (split individuals rather than observations).
nID <- length(unique(data$id))
p <- 0.50 # share of individuals assigned to the training set
# Sample ids, not rows, so that every person ends up in exactly one partition.
inTrain <- sample(unique(data$id), round(nID * p), replace = FALSE)
training <- data[data$id %in% inTrain, ]
testing <- data[!data$id %in% inTrain, ]

# Pooled logit fitted on the training individuals only.
pooled <- glm(
  WORKING ~ WHITEC + FEMALE + BLUEC + HHNINC + AGE + AGESQ + EDUC + DOCVIS,
  data = training, family = binomial(link = "logit")
)
# Predicted probabilities for the held-out individuals, rounded to 0/1 classes.
prediction.working <- round(predict(pooled, newdata = testing, type = "response"))
# confusionMatrix() requires factors with identical levels; passing the raw
# numeric vectors errors in current caret versions.
# Assumes WORKING is coded 0/1 (as the binomial fit suggests); confirm.
confusionMatrix(
  factor(prediction.working, levels = c(0, 1)),
  factor(testing$WORKING, levels = c(0, 1))
) # Accuracy between both
Run Code Online (Sandbox Code Playgroud)
另外,我想对随机效果和固定效果做这些程序.所以我首先尝试了随机效果:
library(glmmML)
RE <- glmmML(WORKING~WHITEC+FEMALE+BLUEC+HHNINC+AGE+AGESQ+EDUC+DOCVIS, family=binomial(link="logit"), data=training, cluster=id, method="ghq", …Run Code Online (Sandbox Code Playgroud) 我想使用R匹配一些特定的字符串,并且只保留匹配的上面的行,这里是一些示例数据.拥有包含数百个类似案例的文件:
# Sample record: a one-row data frame whose `line` column holds a multi-line
# text. Each entry starts with a "#Name: Day, date time#" header, followed by
# the note text; entries are separated by dashed lines.
first_case <- data.frame(
  line = "#John Wayne: Su, 11.01.2013 08:24:42#
He is present / I guess, Does great job
--------------------------------------------------
#Michal Thorn: Fr, 12.09.2015 17:23:01#
Works quite frequently with people
--------------------------------------------------
#Sandra Nunes: Mo, 20.05.2011 09:00:29#
She has some new clients"
)
# Second sample record with the same layout: a one-row data frame whose
# `line` column holds "#Name: Day, date time#" headers, each followed by a
# note, with dashed separator lines between entries.
second_case <- data.frame(
  line = "#Boris Jonson: Mo, 30.09.2017 09:20:42#
He is present
--------------------------------------------------
#Jacky Fine: Th, 02.02.2013 18:23:01#
Does great job
--------------------------------------------------
#Michael Bissping: Mo, 25.03.2012 10:00:29#
Hard to count on"
)
third_case<- data.frame(line =
c("#Isabelle Warren: Sa, 02.12.2013 02:24:42#
Not around …Run Code Online (Sandbox Code Playgroud) 我的问题非常笼统,但是制作一个可重复的例子却有些困难.当运行一些长R脚本时,我有时会得到不合理的错误.由于脚本及其背后的数据库的长度,查找错误通常很困难且耗时.有没有办法以某种方式将错误导出到文本文件?
我想用 library(timeDate) 提取一些假期。我首先使用了以下语法:
# Easter Sunday dates for the years 2015-2018, converted to Date.
# NOTE(review): the result is stored under the name `EasterSunday`, the same
# name as the function being called. Later calls still work because R skips
# non-function objects when it looks up a function, but a distinct variable
# name would be clearer.
EasterSunday<- as.Date(EasterSunday(2015:2018))
EasterSunday
# [1] "2015-04-05" "2016-03-27" "2017-04-16" "2018-04-01"
Run Code Online (Sandbox Code Playgroud)
然后我想添加相应的假日日期序列:
# Two-column data frame: the holiday label `hd` repeated once per date, next
# to the matching Easter Sunday dates for 2015-2018.
EasterSunday <- local({
  # Compute the dates once and reuse them for both columns.
  dates <- as.Date(EasterSunday(2015:2018))
  cbind.data.frame(hd = rep("EasterSunday", length(dates)), date = dates)
})
EasterSunday
#hd #date
#1 EasterSunday 2015-04-05
#2 EasterSunday 2016-03-27
#3 EasterSunday 2017-04-16
#4 EasterSunday 2018-04-01
Run Code Online (Sandbox Code Playgroud)
然后我想在该包中循环所有假期:
# Build one Date column per holiday: each name in `holidays` is resolved to
# the function of that name and called for the years 2015-2018.
holidays <- c("GoodFriday", "EasterSunday", "EasterMonday")
years <- 2015:2018
holiday_dates <- lapply(holidays, function(x) {
  # mode = "function" skips any non-function object that shares the name
  # (e.g. a data frame called EasterSunday) and finds the holiday function.
  as.Date(get(x, mode = "function")(years))
})
# Name the list so that each column carries its holiday name.
names(holiday_dates) <- holidays
do.call(cbind.data.frame, holiday_dates)
# Expected result:
#   GoodFriday EasterSunday EasterMonday
# 1 2015-04-03   2015-04-05   2015-04-06
# 2 2016-03-25   2016-03-27   2016-03-28
# 3 2017-04-14   2017-04-16   2017-04-17
# 4 2018-03-30   2018-04-01   2018-04-02
Run Code Online (Sandbox Code Playgroud) 我在表单中提取ID时遇到一些困难:
27da12ce-85fe-3f28-92f9-e5235a5cf6ac
Run Code Online (Sandbox Code Playgroud)
来自数据框:
# Sample strings of the form "NAME_<id>_<name>": the id sits between the first
# and the second underscore, and the name part may itself contain underscores.
# NOTE(review): this creates a plain character vector `a`, while the later
# snippet works on `df$a`; presumably df <- data.frame(a = a). Confirm.
a<-c("NAME_27da12ce-85fe-3f28-92f9-e5235a5cf6ac_THOMAS_MYR",
"NAME_94773a8c-b71d-3be6-b57e-db9d8740bb98_THIMO",
"NAME_1ed571b4-1aef-3fe2-8f85-b757da2436ee_ALEX",
"NAME_9fbeda37-0e4f-37aa-86ef-11f907812397_JOHN_TYA",
"NAME_83ef784f-3128-35a1-8ff9-daab1c5f944b_BISHOP",
"NAME_39de28ca-5eca-3e6c-b5ea-5b82784cc6f4_DUE_TO",
"NAME_0a52a024-9305-3bf1-a0a6-84b009cc5af4_WIS_MICHAL",
"NAME_2520ebbb-7900-32c9-9f2d-178cf04f7efc_Sarah_Lu_Van_Gar/Thomas")
Run Code Online (Sandbox Code Playgroud)
基本上它是第一个和第二个下划线之间的东西.
通常我的做法是:
library(tidyr)
# NOTE(review): `df` is not created in the visible snippet; presumably it is a
# data frame holding the sample strings in column `a`. Confirm.
# Make sure the column is character.
df$a<-as.character(df$a)
# Keep only the rows whose value contains an underscore.
df<-df[grep("_", df$a), ]
# Split `a` at every "_" into the two columns ID and Name.
# NOTE(review): the sample values contain more than one underscore, so the
# split yields more pieces than the two column names given; the first piece
# ("NAME") would then presumably land in ID instead of the id itself.
df<- separate(df, a, c("ID","Name") , sep = "_")
# NOTE(review): ID holds non-numeric text, so as.numeric() would presumably
# produce NA here. Confirm the intended conversion.
df$a<-as.numeric(df$ID)
Run Code Online (Sandbox Code Playgroud)
然而这次有许多下划线......我的方法失败了.有没有办法提取该ID?