Identifies higher level taxa for each sequence in clusters for a given rank. Selects representative sequences for each unique taxon using the choose_by functions. By default, the function will choose the top ten sequences by first sorting by those with the fewest ambiguous nucleotides, then by youngest, then by sequence length.
See also
Other tools-public: calc_mad(), calc_wrdfrq(), drop_clstrs(), drop_sqs(), get_clstr_slot(), get_nsqs(), get_ntaxa(), get_sq_slot(), get_stage_times(), get_tx_slot(), get_txids(), is_txid_in_clstr(), is_txid_in_sq(), list_clstrrec_slots(), list_ncbi_ranks(), list_seqrec_slots(), list_taxrec_slots(), plot_phylota_pa(), plot_phylota_treemap(), read_phylota(), write_sqs()
Examples
data("dragonflies")
# For faster computations, let's only work with 6 clusters.
dragonflies <- drop_clstrs(phylota = dragonflies, cid = dragonflies@cids[10:15])
# We can use drop_by_rank() to reduce to 10 sequences per genus for each cluster
(reduced_1 <- drop_by_rank(phylota = dragonflies, rnk = 'genus', n = 10,
choose_by = c('pambgs', 'age', 'nncltds'),
greatest = c(FALSE, FALSE, TRUE)))
#> Phylota Table (Anisoptera)
#> - [6] clusters
#> - [411] sequences
#> - [162] source taxa
# We can specify what aspects of the sequences we would like to select per genus
# By default we select the sequences with fewest ambiguous nucleotides (e.g.
# we avoid Ns), the youngest age and then longest sequence.
# We can reverse the 'greatest' to get the opposite.
(reduced_2 <- drop_by_rank(phylota = dragonflies, rnk = 'genus', n = 10,
choose_by = c('pambgs', 'age', 'nncltds'),
greatest = c(TRUE, TRUE, FALSE)))
#> Phylota Table (Anisoptera)
#> - [6] clusters
#> - [411] sequences
#> - [174] source taxa
# Leading to smaller sequences ...
r1_sqlngth <- mean(get_sq_slot(phylota = reduced_1,
sid = reduced_1@sids, slt_nm = 'nncltds'))
r2_sqlngth <- mean(get_sq_slot(phylota = reduced_2,
sid = reduced_2@sids, slt_nm = 'nncltds'))
(r1_sqlngth > r2_sqlngth)
#> [1] FALSE
# ... with more ambiguous characters ...
r1_pambgs <- mean(get_sq_slot(phylota = reduced_1, sid = reduced_1@sids,
slt_nm = 'pambgs'))
r2_pambgs <- mean(get_sq_slot(phylota = reduced_2, sid = reduced_2@sids,
slt_nm = 'pambgs'))
(r1_pambgs < r2_pambgs)
#> [1] TRUE
# .... and older ages (measured in days since being added to GenBank).
r1_age <- mean(get_sq_slot(phylota = reduced_1, sid = reduced_1@sids,
slt_nm = 'age'))
r2_age <- mean(get_sq_slot(phylota = reduced_2, sid = reduced_2@sids,
slt_nm = 'age'))
(r1_age < r2_age)
#> [1] FALSE
# Or... we can simply reduce the clusters to just one sequence per genus
(dragonflies <- drop_by_rank(phylota = dragonflies, rnk = 'genus', n = 1))
#> Phylota Table (Anisoptera)
#> - [6] clusters
#> - [73] sequences
#> - [72] source taxa