Separate all matching text into multiple rows
Examples
# Set-up: parse the example article XML that ships in the tidypmc package's
# extdata directory, then extract its text with pmc_text().
# The commented-out line below is an alternative way to obtain `doc` via
# pmc_xml() -- presumably it retrieves the article by its PMC id; confirm
# before relying on it.
# doc <- pmc_xml("PMC2231364")
doc <- xml2::read_xml(system.file("extdata/PMC2231364.xml",
package = "tidypmc"))
txt <- pmc_text(doc)
# Example 1: a regular expression as the pattern. "[ATCGN]{5,}" matches runs
# of five or more of the letters A, T, C, G, N. The printed result has one row
# per match: the matched string is in `match`, next to the section, paragraph,
# sentence and text it was found in. A sentence containing two matches yields
# two rows (rows 6 and 7 below, both from sentence 8).
separate_text(txt, "[ATCGN]{5,}")
#> # A tibble: 9 × 5
#> match section paragraph sentence text
#> <chr> <chr> <int> <int> <chr>
#> 1 ACGCAATCGTTTTCNT Results and Discussion; Comput… 2 3 A 16…
#> 2 AAACGTTTNCGT Results and Discussion; Comput… 2 4 It i…
#> 3 TGATAATGATTATCATTATCA Results and Discussion; Comput… 2 5 A 21…
#> 4 GATAATGATAATCATTATC Results and Discussion; Comput… 2 6 It i…
#> 5 TGANNNNNNTCAA Results and Discussion; Comput… 2 7 A 15…
#> 6 TTGATN Results and Discussion; Comput… 2 8 It i…
#> 7 NATCAA Results and Discussion; Comput… 2 8 It i…
#> 8 GTTAATTAA Results and Discussion; Comput… 3 4 The …
#> 9 GTTAATTAATGT Results and Discussion; Comput… 3 5 An A…
# Example 2: "\\([A-Z]{3,6}s?\\)" matches a literal "(", three to six
# uppercase letters, an optional trailing "s", and a literal ")". The
# backslashes are doubled because the pattern is written as an R string.
separate_text(txt, "\\([A-Z]{3,6}s?\\)")
#> # A tibble: 5 × 5
#> match section paragraph sentence text
#> <chr> <chr> <int> <int> <chr>
#> 1 (EMSA) Abstract 2 5 Seve…
#> 2 (PMNs) Background 1 8 Most…
#> 3 (SOM) Methods; Clustering analysis 1 4 For …
#> 4 (MEME) Methods; Discovery of regulatory DNA motifs 1 3 Coll…
#> 5 (IPTG) Methods; Gel mobility shift analysis of Fur b… 1 3 Expr…
# Example 3: the pattern can also be a character vector of words instead of a
# single regular expression. Note that the rows below are not in the order of
# the pattern vector (yfe, hmu, yfu, ybt vs. hmu, ybt, yfe, yfu) -- presumably
# they follow the order of appearance in the sentence; confirm if it matters.
separate_text(txt, c("hmu", "ybt", "yfe", "yfu"))
#> # A tibble: 4 × 5
#> match section paragraph sentence text
#> <chr> <chr> <int> <int> <chr>
#> 1 yfe Results and Discussion; Clustering analysis an… 3 4 Gene…
#> 2 hmu Results and Discussion; Clustering analysis an… 3 4 Gene…
#> 3 yfu Results and Discussion; Clustering analysis an… 3 4 Gene…
#> 4 ybt Results and Discussion; Clustering analysis an… 3 4 Gene…
# The remaining functions are wrappers for separate_text() with an extra step
# that expands matched ranges into one row per item. Each result gains a
# leading column (`id` or `gene`) holding the expanded value, while `match`
# keeps the text as it was matched.
# separate_refs(): bracketed citations. "[4,5]" becomes ids 4 and 5, and
# "[7-9]" becomes ids 7, 8 and 9 (rows 4-9 below).
separate_refs(txt)
#> # A tibble: 93 × 6
#> id match section paragraph sentence text
#> <dbl> <chr> <chr> <int> <int> <chr>
#> 1 1 [1] Background 1 1 Yersinia pestis is the etiological…
#> 2 2 [2] Background 1 3 To produce a transmissible infecti…
#> 3 3 [3] Background 1 9 However, a few bacilli are taken u…
#> 4 4 [4,5] Background 1 10 Residence in this niche also facil…
#> 5 5 [4,5] Background 1 10 Residence in this niche also facil…
#> 6 6 [6] Background 2 1 A DNA microarray is able to determ…
#> 7 7 [7-9] Background 2 2 We and others have measured the ge…
#> 8 8 [7-9] Background 2 2 We and others have measured the ge…
#> 9 9 [7-9] Background 2 2 We and others have measured the ge…
#> 10 10 [10] Background 2 2 We and others have measured the ge…
#> # ℹ 83 more rows
# separate_genes(): gene and operon names. In the output below, a capitalised
# match such as "PurR" is reported in `gene` with a lower-case first letter
# ("purR"), and the operon "hmsHFRS" is split into hmsH, hmsF, hmsR, ...
separate_genes(txt)
#> # A tibble: 103 × 6
#> gene match section paragraph sentence text
#> <chr> <chr> <chr> <int> <int> <chr>
#> 1 purR PurR Abstract 2 5 Seve…
#> 2 phoP PhoP Background 2 3 We a…
#> 3 ompR OmpR Background 2 3 We a…
#> 4 oxyR OxyR Background 2 3 We a…
#> 5 csrA CsrA Results and Discussion 1 3 Afte…
#> 6 slyA SlyA Results and Discussion 1 3 Afte…
#> 7 phoPQ PhoPQ Results and Discussion 1 3 Afte…
#> 8 hmsH hmsHFRS Results and Discussion; Virulence gen… 3 3 For …
#> 9 hmsF hmsHFRS Results and Discussion; Virulence gen… 3 3 For …
#> 10 hmsR hmsHFRS Results and Discussion; Virulence gen… 3 3 For …
#> # ℹ 93 more rows
# separate_tags(): identifiers that start with the given prefix (here "YPO").
# A range such as "YPO1994-1996" becomes YPO1994, YPO1995 and YPO1996
# (rows 1-3 below); a single tag such as "YPO0881" yields one row.
separate_tags(txt, "YPO")
#> # A tibble: 35 × 6
#> id match section paragraph sentence text
#> <chr> <chr> <chr> <int> <int> <chr>
#> 1 YPO1994 YPO1994-1996 Results and Discussion; Verifi… 1 4 For …
#> 2 YPO1995 YPO1994-1996 Results and Discussion; Verifi… 1 4 For …
#> 3 YPO1996 YPO1994-1996 Results and Discussion; Verifi… 1 4 For …
#> 4 YPO1087 YPO1087-1088 Results and Discussion; Verifi… 1 4 For …
#> 5 YPO1088 YPO1087-1088 Results and Discussion; Verifi… 1 4 For …
#> 6 YPO0881 YPO0881 Results and Discussion; Verifi… 1 5 Micr…
#> 7 YPO0882 YPO0882 Results and Discussion; Verifi… 1 5 Micr…
#> 8 YPO0883 YPO0883 Results and Discussion; Verifi… 1 5 Micr…
#> 9 YPO0884 YPO0884 Results and Discussion; Verifi… 1 5 Micr…
#> 10 YPO0881 YPO0881-0882 Results and Discussion; Verifi… 2 4 Howe…
#> # ℹ 25 more rows