connect to the DBs
By default, the `src_*` functions use the path to the cached database file. Alternatively, you can pass in your own path if you’ve put the file somewhere else.
ITIS
src_itis <- src_itis()
TPL
src_tpl <- src_tpl()
COL
src_col <- src_col()
query with SQL syntax
sql_collect(src_itis, "select * from hierarchy limit 5")
#> # A tibble: 5 x 5
#> hierarchy_string tsn parent_tsn level childrencount
#> * <chr> <int> <int> <int> <int>
#> 1 202422 202422 0 0 154282
#> 2 202422-846491 846491 202422 1 2666
#> 3 202422-846491-660046 660046 846491 2 2654
#> 4 202422-846491-660046-846497 846497 660046 3 7
#> 5 202422-846491-660046-846497-846508 846508 846497 4 6
# or pipe the src to sql_collect
src_itis %>% sql_collect("select * from hierarchy limit 5")
use dplyr verbs
get a tbl
hiers <- src_itis %>% tbl("hierarchy")
#> # Source: table<hierarchy> [?? x 5]
#> # Database: postgres 9.6.0 [sacmac@localhost:5432/ITIS]
#> hierarchy_string tsn parent_tsn level childrencount
#> <chr> <int> <int> <int> <int>
#> 1 202422 202422 0 0 154282
#> 2 202422-846491 846491 202422 1 2666
#> 3 202422-846491-660046 660046 846491 2 2654
#> 4 202422-846491-660046-846497 846497 660046 3 7
#> 5 202422-846491-660046-846497-846508 846508 846497 4 6
#> 6 202422-846491-660046-846497-846508-846553 846553 846508 5 5
#> 7 202422-846491-660046-846497-846508-846553-954935 954935 846553 6 3
#> 8 202422-846491-660046-846497-846508-846553-954935-5549 5549 954935 7 2
#> 9 202422-846491-660046-846497-846508-846553-954935-5549-5550 5550 5549 8 0
#> 10 202422-846491-660046-846497-846508-846553-954936 954936 846553 6 0
#> # ... with more rows
select certain fields
hiers %>% select(TSN, level)
#> # Source: lazy query [?? x 2]
#> # Database: postgres 9.6.0 [sacmac@localhost:5432/ITIS]
#> tsn level
#> <int> <int>
#> 1 202422 0
#> 2 846491 1
#> 3 660046 2
#> 4 846497 3
#> 5 846508 4
#> 6 846553 5
#> 7 954935 6
#> 8 5549 7
#> 9 5550 8
#> 10 954936 6
#> # ... with more rows
Local versions of taxize functions
A few of the key functions from `taxize` have been ported to `taxizedb`. Support is currently limited to the NCBI taxonomy database.
`children` accesses the nodes immediately descending from a given taxon:
children(3701, db='ncbi')
#> $`3701`
#> childtaxa_id childtaxa_name childtaxa_rank
#> 1 1837063 Arabidopsis thaliana x Arabidopsis halleri species
#> 2 1547872 Arabidopsis umezawana species
#> 3 1328956 (Arabidopsis thaliana x Arabidopsis arenosa) x Arabidopsis suecica species
#> 4 1240361 Arabidopsis thaliana x Arabidopsis arenosa species
#> 5 869750 Arabidopsis thaliana x Arabidopsis lyrata species
#> 6 412662 Arabidopsis pedemontana species
#> 7 378006 Arabidopsis arenosa x Arabidopsis thaliana species
#> 8 347883 Arabidopsis arenicola species
#> 9 302551 Arabidopsis petrogena species
#> 10 97980 Arabidopsis croatica species
#> 11 97979 Arabidopsis cebennensis species
#> 12 81970 Arabidopsis halleri species
#> 13 59690 Arabidopsis kamchatica species
#> 14 59689 Arabidopsis lyrata species
#> 15 45251 Arabidopsis neglecta species
#> 16 45249 Arabidopsis suecica species
#> 17 38785 Arabidopsis arenosa species
#> 18 3702 Arabidopsis thaliana species
#>
#> attr(,"class")
#> [1] "children"
#> attr(,"db")
#> [1] "ncbi"
`classification` finds the lineage of a taxon:
classification(3702, db='ncbi')
#> $`3702`
#> name rank id
#> 1 cellular organisms no rank 131567
#> 2 Eukaryota superkingdom 2759
#> 3 Viridiplantae kingdom 33090
#> 4 Streptophyta phylum 35493
#> 5 Streptophytina subphylum 131221
#> 6 Embryophyta no rank 3193
#> 7 Tracheophyta no rank 58023
#> 8 Euphyllophyta no rank 78536
#> 9 Spermatophyta no rank 58024
#> 10 Magnoliophyta no rank 3398
#> 11 Mesangiospermae no rank 1437183
#> 12 eudicotyledons no rank 71240
#> 13 Gunneridae no rank 91827
#> 14 Pentapetalae no rank 1437201
#> 15 rosids subclass 71275
#> 16 malvids no rank 91836
#> 17 Brassicales order 3699
#> 18 Brassicaceae family 3700
#> 19 Camelineae tribe 980083
#> 20 Arabidopsis genus 3701
#> 21 Arabidopsis thaliana species 3702
#>
#> attr(,"class")
#> [1] "classification"
#> attr(,"db")
#> [1] "ncbi"
`downstream` finds all taxa descending from a taxon:
downstream(3700, db='ncbi')
#> $`3700`
#> childtaxa_id childtaxa_name rank
#> 1 2071891 Draba taylorii species
#> 2 2071524 Rorippa tenerrima species
#> 3 2071523 Rorippa crystallina species
#> 4 2071509 Physaria calderi species
#> 5 2071468 Erysimum arenicola species
#> 6 2071452 Draba yukonensis species
#> 7 2071451 Draba thompsonii species
#> ...
#> 326 1492251 Erysimum lilacinum species
#> 327 1492250 Erysimum leucanthemum species
#> 328 1492249 Erysimum leptostylum species
#> 329 1492248 Erysimum leptophyllum species
#> 330 1492247 Erysimum leptocarpum species
#> 331 1492246 Erysimum ledebourii species
#> 332 1492245 Erysimum laxiflorum species
#> 333 1492244 Erysimum kurdicum species
#> [ reached getOption("max.print") -- omitted 2880 rows ]
#>
#> attr(,"class")
#> [1] "downstream"
#> attr(,"db")
#> [1] "ncbi"
All of these functions run very fast. It takes only a few seconds to find all bacterial taxa and count them by rank:
downstream(2, db='ncbi')[[1]] %>%
dplyr::group_by(rank) %>%
dplyr::count()
#> [1] 138695
#> # A tibble: 18 x 2
#> # Groups: rank [18]
#> rank n
#> <chr> <int>
#> 1 class 83
#> 2 family 483
#> 3 forma 4
#> 4 genus 3497
#> 5 no rank 37140
#> 6 order 198
#> 7 phylum 134
#> 8 species 97031
#> 9 species group 68
#> 10 species subgroup 10
#> 11 subclass 3
#> 12 subfamily 1
#> 13 subgenus 1
#> 14 suborder 8
#> 15 subphylum 1
#> 16 subspecies 10
#> 17 tribe 2
#> 18 varietas 21
Mapping functions
Several mapping functions are available for the NCBI taxonomy database:
# Map scientific or common names to taxonomy IDs
name2taxid("pig")
#> [1] "9823"
# Map taxonomy IDs to scientific names
taxid2name(9823)
#> [1] "Sus scrofa"
# Map taxonomy IDs to rank
taxid2rank(2)
#> [1] "superkingdom"