R XML - 将父节点和子节点合并为数据框

Tam*_*boy 2 xml xpath r

我有这样的xml:

<root>
<cards>
<meeting name="Punchestown (IRE)" id="195" diffusion_course_name="PUNCHESTOWN">
      <race id="692415" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
        <time>12:25</time>
        <date>2018-01-13</date>
        <ampm>pm</ampm>
        <title>Adare Manor Opportunity Handicap Chase</title>
        <type>C</type>
        <distance>2m4f</distance>
        <group>Handicap</group>
        <tipsAllowed>1</tipsAllowed>
        <predictorAllowed>1</predictorAllowed>
        <bettingLink>1</bettingLink>
        <declaredRunners>10</declaredRunners>
        <liveCommentary>1</liveCommentary>
        <liveTab>1</liveTab>
        <raceDescription>Handicap Chase</raceDescription>
        <tvText>ATR </tvText>
      </race>
      <race id="692416" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
        <time>1:00</time>
        <date>2018-01-13</date>
        <ampm>pm</ampm>
        <title>Total Event Rental (Kildare) Novice Chase (Grade 3)</title>
        <type>C</type>
        <distance>2m4f</distance>
        <group>Grade 3</group>
        <tipsAllowed>1</tipsAllowed>
        <predictorAllowed>1</predictorAllowed>
        <bettingLink>1</bettingLink>
        <declaredRunners>7</declaredRunners>
        <liveCommentary>1</liveCommentary>
        <liveTab>1</liveTab>
        <raceDescription>Novice Chase Grade 3</raceDescription>
        <tvText>ATR </tvText>
      </race>
      <race id="692417" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
        <time>1:35</time>
        <date>2018-01-13</date>
        <ampm>pm</ampm>
        <title>Connolly's RED MILLS Amateur National (Q.R.) Handicap Chase</title>
        <type>C</type>
        <distance>3m1f</distance>
        <group>Handicap</group>
        <tipsAllowed>1</tipsAllowed>
        <predictorAllowed>1</predictorAllowed>
        <bettingLink>1</bettingLink>
        <declaredRunners>12</declaredRunners>
        <liveCommentary>1</liveCommentary>
        <liveTab>1</liveTab>
        <raceDescription>Handicap Chase</raceDescription>
        <tvText>ATR </tvText>
      </race>
      <race id="692418" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
        <time>2:10</time>
        <date>2018-01-13</date>
        <ampm>pm</ampm>
        <title>Sky Bet Moscow Flyer Novice Hurdle (Grade 2)</title>
        <type>H</type>
        <distance>2m</distance>
        <group>Grade 2</group>
        <tipsAllowed>1</tipsAllowed>
        <predictorAllowed>1</predictorAllowed>
        <bettingLink>1</bettingLink>
        <declaredRunners>7</declaredRunners>
        <liveCommentary>1</liveCommentary>
        <liveTab>1</liveTab>
        <raceDescription>Novice Hurdle Grade 2</raceDescription>
        <tvText>ATR </tvText>
      </race>
      <race id="692419" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
        <time>2:45</time>
        <date>2018-01-13</date>
        <ampm>pm</ampm>
        <title>Sportinglife.com Maiden Hurdle</title>
        <type>H</type>
        <distance>2m</distance>
        <group/>
        <tipsAllowed>1</tipsAllowed>
        <predictorAllowed>1</predictorAllowed>
        <bettingLink>1</bettingLink>
        <declaredRunners>17</declaredRunners>
        <liveCommentary>1</liveCommentary>
        <liveTab>1</liveTab>
        <raceDescription>Maiden Hurdle</raceDescription>
        <tvText>ATR </tvText>
      </race>
      <race id="692420" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
        <time>3:20</time>
        <date>2018-01-13</date>
        <ampm>pm</ampm>
        <title>Leinster Leader Mares Handicap Hurdle</title>
        <type>H</type>
        <distance>2m4f40y</distance>
        <group>Handicap</group>
        <tipsAllowed>1</tipsAllowed>
        <predictorAllowed>1</predictorAllowed>
        <bettingLink>1</bettingLink>
        <declaredRunners>8</declaredRunners>
        <liveCommentary>1</liveCommentary>
        <liveTab>1</liveTab>
        <raceDescription>Handicap Hurdle</raceDescription>
        <tvText>ATR </tvText>
      </race>
      <race id="692421" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
        <time>3:50</time>
        <date>2018-01-13</date>
        <ampm>pm</ampm>
        <title>David Trundley Artist At Punchestown Irish Stallion Farms EBF Mares Flat Race</title>
        <type>B</type>
        <distance>2m</distance>
        <group/>
        <tipsAllowed>1</tipsAllowed>
        <predictorAllowed>1</predictorAllowed>
        <bettingLink>1</bettingLink>
        <declaredRunners>14</declaredRunners>
        <liveCommentary>1</liveCommentary>
        <liveTab>1</liveTab>
        <raceDescription>NHF</raceDescription>
        <tvText>ATR </tvText>
      </race>
    </meeting>
    <meeting name="Warwick" id="85" diffusion_course_name="WARWICK">
      <race id="691061" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
        <time>12:40</time>
        <date>2018-01-13</date>
        <ampm>pm</ampm>
        <title>Betfred Supports Jack Berry House Novices' Handicap Hurdle</title>
        <type>H</type>
        <distance>2m</distance>
        <group>Handicap</group>
        <tipsAllowed>1</tipsAllowed>
        <predictorAllowed>1</predictorAllowed>
        <bettingLink>1</bettingLink>
        <declaredRunners>18</declaredRunners>
        <liveCommentary>1</liveCommentary>
        <liveTab>1</liveTab>
        <raceDescription>Class 4 Novice Handicap Hurdle</raceDescription>
        <tvText>RUK </tvText>
        <betOffers>
          <betOffer>WH</betOffer>
        </betOffers>
      </race>
      <race id="691060" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
        <time>1:15</time>
        <date>2018-01-13</date>
        <ampm>pm</ampm>
        <title>Betfred Mobile Edward Courage Cup Handicap Chase</title>
        <type>C</type>
        <distance>2m54y</distance>
        <group>Handicap</group>
        <tipsAllowed>1</tipsAllowed>
        <predictorAllowed>1</predictorAllowed>
        <bettingLink>1</bettingLink>
        <declaredRunners>7</declaredRunners>
        <liveCommentary>1</liveCommentary>
        <liveTab>1</liveTab>
        <raceDescription>Class 3 Handicap Chase</raceDescription>
        <tvText>RUK </tvText>
        <betOffers>
          <betOffer>LB</betOffer>
          <betOffer>WH</betOffer>
        </betOffers>
      </race>
      <race id="691058" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
        <time>1:50</time>
        <date>2018-01-13</date>
        <ampm>pm</ampm>
        <title>Betfred Home Of Goals Galore Hampton Novices' Chase (Listed Race)</title>
        <type>C</type>
        <distance>3m</distance>
        <group>Listed</group>
        <tipsAllowed>1</tipsAllowed>
        <predictorAllowed>1</predictorAllowed>
        <bettingLink>1</bettingLink>
        <declaredRunners>5</declaredRunners>
        <liveCommentary>1</liveCommentary>
        <liveTab>1</liveTab>
        <raceDescription>Class 1 Novice Chase Listed</raceDescription>
        <tvText>ITV4 </tvText>
        <betOffers>
          <betOffer>Coral</betOffer>
        </betOffers>
      </race>
      <race id="691059" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
        <time>2:25</time>
        <date>2018-01-13</date>
        <ampm>pm</ampm>
        <title>Pertemps Network Handicap Hurdle (Series Qualifier)</title>
        <type>H</type>
        <distance>3m1f</distance>
        <group>Handicap</group>
        <tipsAllowed>1</tipsAllowed>
        <predictorAllowed>1</predictorAllowed>
        <bettingLink>1</bettingLink>
        <declaredRunners>12</declaredRunners>
        <liveCommentary>1</liveCommentary>
        <liveTab>1</liveTab>
        <raceDescription>Class 2 Handicap Hurdle</raceDescription>
        <tvText>ITV4 </tvText>
        <betOffers>
          <betOffer>LB</betOffer>
          <betOffer>WH</betOffer>
          <betOffer>Coral</betOffer>
        </betOffers>
      </race>
      <race id="691057" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
        <time>3:00</time>
        <date>2018-01-13</date>
        <ampm>pm</ampm>
        <title>Ballymore Leamington Novices' Hurdle (Grade 2)</title>
        <type>H</type>
        <distance>2m5f</distance>
        <group>Grade 2</group>
        <tipsAllowed>1</tipsAllowed>
        <predictorAllowed>1</predictorAllowed>
        <bettingLink>1</bettingLink>
        <declaredRunners>6</declaredRunners>
        <liveCommentary>1</liveCommentary>
        <liveTab>1</liveTab>
        <raceDescription>Class 1 Novice Hurdle Grade 2</raceDescription>
        <tvText>ITV4 </tvText>
        <betOffers>
          <betOffer>WH</betOffer>
          <betOffer>Coral</betOffer>
        </betOffers>
      </race>
      <race id="691056" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
        <time>3:35</time>
        <date>2018-01-13</date>
        <ampm>pm</ampm>
        <title>Betfred Classic Handicap Chase (Grade 3)</title>
        <type>C</type>
        <distance>3m5f54y</distance>
        <group>Grade 3 Handicap</group>
        <tipsAllowed>1</tipsAllowed>
        <predictorAllowed>1</predictorAllowed>
        <bettingLink>1</bettingLink>
        <declaredRunners>15</declaredRunners>
        <liveCommentary>1</liveCommentary>
        <liveTab>1</liveTab>
        <raceDescription>Class 1 Handicap Chase Grade 3</raceDescription>
        <tvText>ITV4 </tvText>
        <betOffers>
          <betOffer>LB</betOffer>
          <betOffer>WH</betOffer>
          <betOffer>Coral</betOffer>
          <betOffer>PP</betOffer>
        </betOffers>
      </race>
      <race id="691062" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
        <time>4:05</time>
        <date>2018-01-13</date>
        <ampm>pm</ampm>
        <title>Betfred TV "Newcomers" Standard Open National Hunt Flat Race</title>
        <type>B</type>
        <distance>2m</distance>
        <group/>
        <tipsAllowed>1</tipsAllowed>
        <predictorAllowed>1</predictorAllowed>
        <bettingLink>1</bettingLink>
        <declaredRunners>9</declaredRunners>
        <liveCommentary>1</liveCommentary>
        <liveTab>1</liveTab>
        <raceDescription>Class 5 NHF</raceDescription>
        <tvText>RUK </tvText>
        <betOffers>
          <betOffer>WH</betOffer>
        </betOffers>
      </race>
    </meeting>
    <meeting name="Wetherby" id="87" diffusion_course_name="WETHERBY">
      <race id="691067" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
        <time>12:30</time>
        <date>2018-01-13</date>
        <ampm>pm</ampm>
        <title>Racing UK Jump To It Novices' Hurdle</title>
        <type>H</type>
        <distance>2m3f154y</distance>
        <group/>
        <tipsAllowed>1</tipsAllowed>
        <predictorAllowed>1</predictorAllowed>
        <bettingLink>1</bettingLink>
        <declaredRunners>9</declaredRunners>
        <liveCommentary>1</liveCommentary>
        <liveTab>1</liveTab>
        <raceDescription>Class 4 Novice Hurdle</raceDescription>
        <tvText>RUK </tvText>
      </race>
      <race id="691066" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
        <time>1:05</time>
        <date>2018-01-13</date>
        <ampm>pm</ampm>
        <title>Racing UK In Stunning HD "Confined" Novices' Chase</title>
        <type>C</type>
        <distance>2m3f85y</distance>
        <group/>
        <tipsAllowed>1</tipsAllowed>
        <predictorAllowed>1</predictorAllowed>
        <bettingLink>1</bettingLink>
        <declaredRunners>7</declaredRunners>
        <liveCommentary>1</liveCommentary>
        <liveTab>1</liveTab>
        <raceDescription>Class 4 Novice Chase</raceDescription>
        <tvText>RUK </tvText>
      </race>
      <race id="691068" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
        <time>1:40</time>
        <date>2018-01-13</date>
        <ampm>pm</ampm>
        <title>Bet At racinguk.com Handicap Hurdle</title>
        <type>H</type>
        <distance>2m</distance>
        <group>Handicap</group>
        <tipsAllowed>1</tipsAllowed>
        <predictorAllowed>1</predictorAllowed>
        <bettingLink>1</bettingLink>
        <declaredRunners>9</declaredRunners>
        <liveCommentary>1</liveCommentary>
        <liveTab>1</liveTab>
        <raceDescription>Class 4 Handicap Hurdle</raceDescription>
        <tvText>RUK </tvText>
      </race>
      <race id="691063" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
        <time>2:15</time>
        <date>2018-01-13</date>
        <ampm>pm</ampm>
        <title>totescoop6 Play Today Handicap Chase</title>
        <type>C</type>
        <distance>1m7f36y</distance>
        <group>Handicap</group>
        <tipsAllowed>1</tipsAllowed>
        <predictorAllowed>1</predictorAllowed>
        <bettingLink>1</bettingLink>
        <declaredRunners>5</declaredRunners>
        <liveCommentary>1</liveCommentary>
        <liveTab>1</liveTab>
        <raceDescription>Class 2 Handicap Chase</raceDescription>
        <tvText>RUK </tvText>
      </race>
      <race id="691064" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
        <time>2:50</time>
        <date>2018-01-13</date>
        <ampm>pm</ampm>
        <title>totescoop6 Results On totepoolliveinfo.com Handicap Hurdle</title>
        <type>H</type>
        <distance>2m3f154y</distance>
        <group>Handicap</group>
        <tipsAllowed>1</tipsAllowed>
        <predictorAllowed>1</predictorAllowed>
        <bettingLink>1</bettingLink>
        <declaredRunners>11</declaredRunners>
        <liveCommentary>1</liveCommentary>
        <liveTab>1</liveTab>
        <raceDescription>Class 3 Handicap Hurdle</raceDescription>
        <tvText>RUK </tvText>
      </race>
      <race id="691065" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
        <time>3:25</time>
        <date>2018-01-13</date>
        <ampm>pm</ampm>
        <title>Book Now For Medieval Day - 3rd February Handicap Chase (Northern Lights Middle Distance Series)</title>
        <type>C</type>
        <distance>2m3f85y</distance>
        <group>Handicap</group>
        <tipsAllowed>1</tipsAllowed>
        <predictorAllowed>1</predictorAllowed>
        <bettingLink>1</bettingLink>
        <declaredRunners>7</declaredRunners>
        <liveCommentary>1</liveCommentary>
        <liveTab>1</liveTab>
        <raceDescription>Class 4 Handicap Chase</raceDescription>
        <tvText>RUK </tvText>
        <betOffers>
          <betOffer>LB</betOffer>
        </betOffers>
      </race>
      <race id="691069" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
        <time>3:55</time>
        <date>2018-01-13</date>
        <ampm>pm</ampm>
        <title>Racing UK On Sky 432 Fillies' "Junior" Standard Open National Hunt Flat Race</title>
        <type>B</type>
        <distance>1m4f77y</distance>
        <group/>
        <tipsAllowed>1</tipsAllowed>
        <predictorAllowed>1</predictorAllowed>
        <bettingLink>1</bettingLink>
        <declaredRunners>8</declaredRunners>
        <liveCommentary>1</liveCommentary>
        <liv

ali*_*ire 5

下面是一种做法:用 xml2 处理 XML,用 tidyverse 整理数据。属性(xml_attrs 返回一个命名字符向量)、节点名称和节点值可以读入一个含三个元素的列表,该列表可强制转换为数据框:

\n\n
library(tidyverse)\nlibrary(xml2)\n\nx <- read_xml(\'races.xml\')\n\nraces <- x %>% \n    xml_find_all(\'//race\') %>% \n    map_dfr(~list(attrs = list(xml_attrs(.x)), \n                  variable = list(map(xml_children(.x), xml_name)), \n                  value = list(map(xml_children(.x), xml_text))))\n\nraces\n#> # A tibble: 29 x 3\n#>    attrs     variable    value      \n#>    <list>    <list>      <list>     \n#>  1 <chr [5]> <list [15]> <list [15]>\n#>  2 <chr [5]> <list [15]> <list [15]>\n#>  3 <chr [5]> <list [15]> <list [15]>\n#>  4 <chr [5]> <list [15]> <list [15]>\n#>  5 <chr [5]> <list [15]> <list [15]>\n#>  6 <chr [5]> <list [15]> <list [15]>\n#>  7 <chr [5]> <list [15]> <list [15]>\n#>  8 <chr [5]> <list [16]> <list [16]>\n#>  9 <chr [5]> <list [16]> <list [16]>\n#> 10 <chr [5]> <list [16]> <list [16]>\n#> # ... with 19 more rows\n
Run Code Online (Sandbox Code Playgroud)\n

接着可以用一连串 tidyr 操作把它整理干净:

\n
races_tidy <- races %>% \n    mutate(attr_names = map(attrs, names)) %>% \n    unnest(attr_names, attrs, .drop = FALSE) %>% \n    spread(attr_names, attrs) %>% \n    unnest(variable, value) %>% \n    unnest(variable, value) %>% \n    spread(variable, value) %>% \n    type_convert()    # fix variable types\n
Run Code Online (Sandbox Code Playgroud)\n

这样做可行,但 unnest 和 spread 这些步骤比较脆弱。不过,写一个更健壮的版本其实并不费多少功夫,因为可以在 unnest 之前先把列表列整理好:

\n\n
races_tidy2 <- races %>% \n    mutate(attrs = map(attrs, ~as_tibble(as.list(.x))), \n           data = map2(variable, value, ~as_tibble(set_names(.y, .x)))) %>% \n    unnest(attrs, data, .drop = TRUE) %>% \n    type_convert()\n
Run Code Online (Sandbox Code Playgroud)\n

最直接的方法是在遍历节点时就把数据整理成合适的结构。这种写法最简洁,也可能最高效,但要写对需要仔细处理数据结构,因此写出可用的代码可能要花更长的时间。

\n\n
races_tidy3 <- x %>% \n    xml_find_all(\'//race\') %>% \n    map_dfr(~flatten(c(xml_attrs(.x), \n                       map(xml_children(.x), \n                           ~set_names(as.list(xml_text(.x)), xml_name(.x)))))) %>%\n    type_convert()\n\nraces_tidy3\n#> # A tibble: 29 x 21\n#>        id perf… perf… deta… race… time  date       ampm  title type  dist…\n#>     <int> <chr> <chr> <int> <chr> <tim> <date>     <chr> <chr> <chr> <chr>\n#>  1 692415 <NA>  <NA>      1 R     12:25 2018-01-13 pm    Adar… C     2m4f \n#>  2 692416 <NA>  <NA>      1 R     01:00 2018-01-13 pm    Tota… C     2m4f \n#>  3 692417 <NA>  <NA>      1 R     01:35 2018-01-13 pm    Conn… C     3m1f \n#>  4 692418 <NA>  <NA>      1 R     02:10 2018-01-13 pm    Sky … H     2m   \n#>  5 692419 <NA>  <NA>      1 R     02:45 2018-01-13 pm    Spor… H     2m   \n#>  6 692420 <NA>  <NA>      1 R     03:20 2018-01-13 pm    Lein… H     2m4f…\n#>  7 692421 <NA>  <NA>      1 R     03:50 2018-01-13 pm    Davi… B     2m   \n#>  8 691061 <NA>  <NA>      1 R     12:40 2018-01-13 pm    Betf… H     2m   \n#>  9 691060 <NA>  <NA>      1 R     01:15 2018-01-13 pm    Betf… C     2m54y\n#> 10 691058 <NA>  <NA>      1 R     01:50 2018-01-13 pm    Betf… C     3m   \n#> # ... with 19 more rows, and 10 more variables: group <chr>, tipsAllowed\n#> #   <int>, predictorAllowed <int>, bettingLink <int>, declaredRunners\n#> #   <int>, liveCommentary <int>, liveTab <int>, raceDescription <chr>,\n#> #   tvText <chr>, betOffers <chr>\n
Run Code Online (Sandbox Code Playgroud)\n

它们返回的数据完全相同,只是 races_tidy 的列顺序不同:

\n
all_equal(races_tidy, races_tidy2)\n#> [1] TRUE\n\nidentical(races_tidy2, races_tidy3)\n#> [1] TRUE\n
Run Code Online (Sandbox Code Playgroud)\n