The Search function
The main interface to searching documents in your Elasticsearch store
is the function `Search()`. I nearly always develop R
software using all lowercase, but R has a function called
`search()`, and I wanted to avoid collision with that
function.
`Search()` is an interface to both the HTTP search API (in
which queries are passed in the URI of the request, meaning queries have
to be relatively simple) and the POST API, or the Query DSL, in
which queries are passed in the body of the request (so can be much more
complex).
There are a huge number of ways you can search Elasticsearch documents - this tutorial covers some of them, and highlights the ways in which you interact with the R outputs.
x <- connect()
Search an index
out <- Search(x, index="shakespeare")
out$hits$total
#> $value
#> [1] 5000
#>
#> $relation
#> [1] "eq"
out$hits$hits[[1]]
#> $`_index`
#> [1] "shakespeare"
#>
#> $`_type`
#> [1] "_doc"
#>
#> $`_id`
#> [1] "0"
#>
#> $`_score`
#> [1] 1
#>
#> $`_source`
#> $`_source`$line_id
#> [1] 1
#>
#> $`_source`$play_name
#> [1] "Henry IV"
#>
#> $`_source`$line_number
#> [1] ""
#>
#> $`_source`$speaker
#> [1] ""
#>
#> $`_source`$text_entry
#> [1] "ACT I"
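Search an index by type (no type is passed in this example, so the call is the same as searching the index alone, as above)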
Search(x, index = "shakespeare")$hits$hits[[1]]
#> $`_index`
#> [1] "shakespeare"
#>
#> $`_type`
#> [1] "_doc"
#>
#> $`_id`
#> [1] "0"
#>
#> $`_score`
#> [1] 1
#>
#> $`_source`
#> $`_source`$line_id
#> [1] 1
#>
#> $`_source`$play_name
#> [1] "Henry IV"
#>
#> $`_source`$line_number
#> [1] ""
#>
#> $`_source`$speaker
#> [1] ""
#>
#> $`_source`$text_entry
#> [1] "ACT I"
Return certain fields
Search(x, index = "shakespeare", body = '{
"_source": ["play_name", "speaker"]
}')$hits$hits[[1]]
#> $`_index`
#> [1] "shakespeare"
#>
#> $`_type`
#> [1] "_doc"
#>
#> $`_id`
#> [1] "0"
#>
#> $`_score`
#> [1] 1
#>
#> $`_source`
#> $`_source`$play_name
#> [1] "Henry IV"
#>
#> $`_source`$speaker
#> [1] ""
Paging
Search(x, index="shakespeare", size=1, from=1)$hits
#> $total
#> $total$value
#> [1] 5000
#>
#> $total$relation
#> [1] "eq"
#>
#>
#> $max_score
#> [1] 1
#>
#> $hits
#> $hits[[1]]
#> $hits[[1]]$`_index`
#> [1] "shakespeare"
#>
#> $hits[[1]]$`_type`
#> [1] "_doc"
#>
#> $hits[[1]]$`_id`
#> [1] "1"
#>
#> $hits[[1]]$`_score`
#> [1] 1
#>
#> $hits[[1]]$`_source`
#> $hits[[1]]$`_source`$line_id
#> [1] 2
#>
#> $hits[[1]]$`_source`$play_name
#> [1] "Henry IV"
#>
#> $hits[[1]]$`_source`$line_number
#> [1] ""
#>
#> $hits[[1]]$`_source`$speaker
#> [1] ""
#>
#> $hits[[1]]$`_source`$text_entry
#> [1] "SCENE I. London. The palace."
Queries
Using the `q` parameter you can pass in a query, which
gets passed in the URI of the request. This type of query is less powerful
than a query passed in the body of the request using the
`body` parameter (shown further below).
Search(x, index="shakespeare", q="speaker:KING HENRY IV")$hits$total
#> $value
#> [1] 5000
#>
#> $relation
#> [1] "eq"
More complex queries
Here, query for values from 10 to 20 in the field `line_id`
Search(x, index="shakespeare", q="line_id:[10 TO 20]")$hits$total
#> $value
#> [1] 11
#>
#> $relation
#> [1] "eq"
Get raw data
Search(x, index="shakespeare", raw=TRUE)
#> [1] "{\"took\":0,\"timed_out\":false,\"_shards\":{\"total\":1,\"successful\":1,\"skipped\":0,\"failed\":0},\"hits\":{\"total\":{\"value\":5000,\"relation\":\"eq\"},\"max_score\":1.0,\"hits\":[{\"_index\":\"shakespeare\",\"_type\":\"_doc\",\"_id\":\"0\",\"_score\":1.0,\"_source\":{\"line_id\":1,\"play_name\":\"Henry IV\",\"line_number\":\"\",\"speaker\":\"\",\"text_entry\":\"ACT I\"}},{\"_index\":\"shakespeare\",\"_type\":\"_doc\",\"_id\":\"1\",\"_score\":1.0,\"_source\":{\"line_id\":2,\"play_name\":\"Henry IV\",\"line_number\":\"\",\"speaker\":\"\",\"text_entry\":\"SCENE I. London. The palace.\"}},{\"_index\":\"shakespeare\",\"_type\":\"_doc\",\"_id\":\"2\",\"_score\":1.0,\"_source\":{\"line_id\":3,\"play_name\":\"Henry IV\",\"line_number\":\"\",\"speaker\":\"\",\"text_entry\":\"Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others\"}},{\"_index\":\"shakespeare\",\"_type\":\"_doc\",\"_id\":\"3\",\"_score\":1.0,\"_source\":{\"line_id\":4,\"play_name\":\"Henry IV\",\"speech_number\":1,\"line_number\":\"1.1.1\",\"speaker\":\"KING HENRY IV\",\"text_entry\":\"So shaken as we are, so wan with care,\"}},{\"_index\":\"shakespeare\",\"_type\":\"_doc\",\"_id\":\"4\",\"_score\":1.0,\"_source\":{\"line_id\":5,\"play_name\":\"Henry IV\",\"speech_number\":1,\"line_number\":\"1.1.2\",\"speaker\":\"KING HENRY IV\",\"text_entry\":\"Find we a time for frighted peace to pant,\"}},{\"_index\":\"shakespeare\",\"_type\":\"_doc\",\"_id\":\"5\",\"_score\":1.0,\"_source\":{\"line_id\":6,\"play_name\":\"Henry IV\",\"speech_number\":1,\"line_number\":\"1.1.3\",\"speaker\":\"KING HENRY IV\",\"text_entry\":\"And breathe short-winded accents of new broils\"}},{\"_index\":\"shakespeare\",\"_type\":\"_doc\",\"_id\":\"6\",\"_score\":1.0,\"_source\":{\"line_id\":7,\"play_name\":\"Henry IV\",\"speech_number\":1,\"line_number\":\"1.1.4\",\"speaker\":\"KING HENRY IV\",\"text_entry\":\"To be commenced in strands afar 
remote.\"}},{\"_index\":\"shakespeare\",\"_type\":\"_doc\",\"_id\":\"7\",\"_score\":1.0,\"_source\":{\"line_id\":8,\"play_name\":\"Henry IV\",\"speech_number\":1,\"line_number\":\"1.1.5\",\"speaker\":\"KING HENRY IV\",\"text_entry\":\"No more the thirsty entrance of this soil\"}},{\"_index\":\"shakespeare\",\"_type\":\"_doc\",\"_id\":\"8\",\"_score\":1.0,\"_source\":{\"line_id\":9,\"play_name\":\"Henry IV\",\"speech_number\":1,\"line_number\":\"1.1.6\",\"speaker\":\"KING HENRY IV\",\"text_entry\":\"Shall daub her lips with her own childrens blood;\"}},{\"_index\":\"shakespeare\",\"_type\":\"_doc\",\"_id\":\"9\",\"_score\":1.0,\"_source\":{\"line_id\":10,\"play_name\":\"Henry IV\",\"speech_number\":1,\"line_number\":\"1.1.7\",\"speaker\":\"KING HENRY IV\",\"text_entry\":\"Nor more shall trenching war channel her fields,\"}}]}}"
Curl debugging
Common options are `verbose=TRUE`, `timeout_ms=1`, and `followlocation=TRUE`.
out <- Search(x, index="shakespeare", verbose = TRUE)
Query DSL searches - queries sent in the body of the request
Pass in as an R list
mapping_create(x, "shakespeare", update_all_types = TRUE, body = '{
"properties": {
"text_entry": {
"type": "text",
"fielddata": true
}
}
}')
#> $acknowledged
#> [1] TRUE
aggs <- list(aggs = list(stats = list(terms = list(field = "text_entry"))))
Search(x, index="shakespeare", body=aggs)$hits$hits[[1]]
#> $`_index`
#> [1] "shakespeare"
#>
#> $`_type`
#> [1] "_doc"
#>
#> $`_id`
#> [1] "0"
#>
#> $`_score`
#> [1] 1
#>
#> $`_source`
#> $`_source`$line_id
#> [1] 1
#>
#> $`_source`$play_name
#> [1] "Henry IV"
#>
#> $`_source`$line_number
#> [1] ""
#>
#> $`_source`$speaker
#> [1] ""
#>
#> $`_source`$text_entry
#> [1] "ACT I"
Or pass in as json query with newlines, easy to read
aggs <- '{
"aggs": {
"stats" : {
"terms" : {
"field" : "text_entry"
}
}
}
}'
Search(x, index="shakespeare", body=aggs)$hits$hits[[1]]
#> $`_index`
#> [1] "shakespeare"
#>
#> $`_type`
#> [1] "_doc"
#>
#> $`_id`
#> [1] "0"
#>
#> $`_score`
#> [1] 1
#>
#> $`_source`
#> $`_source`$line_id
#> [1] 1
#>
#> $`_source`$play_name
#> [1] "Henry IV"
#>
#> $`_source`$line_number
#> [1] ""
#>
#> $`_source`$speaker
#> [1] ""
#>
#> $`_source`$text_entry
#> [1] "ACT I"
Or pass in collapsed json string
aggs <- '{"aggs":{"stats":{"terms":{"field":"text_entry"}}}}'
Search(x, index="shakespeare", body=aggs)$hits$hits[[1]]
#> $`_index`
#> [1] "shakespeare"
#>
#> $`_type`
#> [1] "_doc"
#>
#> $`_id`
#> [1] "0"
#>
#> $`_score`
#> [1] 1
#>
#> $`_source`
#> $`_source`$line_id
#> [1] 1
#>
#> $`_source`$play_name
#> [1] "Henry IV"
#>
#> $`_source`$line_number
#> [1] ""
#>
#> $`_source`$speaker
#> [1] ""
#>
#> $`_source`$text_entry
#> [1] "ACT I"
Aggregations
Histograms
aggs <- '{
"aggs": {
"latbuckets" : {
"histogram" : {
"field" : "decimalLatitude",
"interval" : 5
}
}
}
}'
Search(x, index="gbif", body=aggs, size=0)$aggregations$latbuckets$buckets[1:3]
#> [[1]]
#> [[1]]$key
#> [1] -35
#>
#> [[1]]$doc_count
#> [1] 1
#>
#>
#> [[2]]
#> [[2]]$key
#> [1] -30
#>
#> [[2]]$doc_count
#> [1] 0
#>
#>
#> [[3]]
#> [[3]]$key
#> [1] -25
#>
#> [[3]]$doc_count
#> [1] 0
A bool query
mmatch <- '{
"query": {
"bool" : {
"must_not" : {
"range" : {
"speech_number" : {
"from" : 1, "to": 5
}}}}}}'
sapply(Search(x, index="shakespeare", body=mmatch)$hits$hits, function(x) x$`_source`$speech_number)
#> [[1]]
#> NULL
#>
#> [[2]]
#> NULL
#>
#> [[3]]
#> NULL
#>
#> [[4]]
#> [1] 6
#>
#> [[5]]
#> [1] 6
#>
#> [[6]]
#> [1] 7
#>
#> [[7]]
#> [1] 7
#>
#> [[8]]
#> [1] 7
#>
#> [[9]]
#> [1] 7
#>
#> [[10]]
#> [1] 7
Fuzzy query
Fuzzy query on a text field (`text_entry`)
fuzzy <- list(query = list(fuzzy = list(text_entry = "arms")))
Search(x, index="shakespeare", body = fuzzy)$hits$total
#> $value
#> [1] 49
#>
#> $relation
#> [1] "eq"
fuzzy <- list(query = list(fuzzy = list(text_entry = list(value = "arms", fuzziness = 4))))
Search(x, index="shakespeare", body=fuzzy)$hits$total
#> $value
#> [1] 618
#>
#> $relation
#> [1] "eq"
Range query
With numeric
body <- list(query=list(range=list(decimalLongitude=list(gte=1, lte=3))))
Search(x, 'gbif', body=body)$hits$total
#> $value
#> [1] 24
#>
#> $relation
#> [1] "eq"
body <- list(query=list(range=list(decimalLongitude=list(gte=2.9, lte=10))))
Search(x, 'gbif', body=body)$hits$total
#> $value
#> [1] 126
#>
#> $relation
#> [1] "eq"
With dates
body <- list(query=list(range=list(eventDate=list(gte="2012-01-01", lte="now"))))
Search(x, 'gbif', body=body)$hits$total
#> $value
#> [1] 301
#>
#> $relation
#> [1] "eq"
body <- list(query=list(range=list(eventDate=list(gte="2014-01-01", lte="now"))))
Search(x, 'gbif', body=body)$hits$total
#> $value
#> [1] 292
#>
#> $relation
#> [1] "eq"
More-like-this query (more_like_this can be shortened to mlt)
body <- '{
"query": {
"more_like_this": {
"fields": ["abstract","title"],
"like": "and then",
"min_term_freq": 1,
"max_query_terms": 12
}
}
}'
Search(x, 'plos', body=body)$hits$total
#> $value
#> [1] 488
#>
#> $relation
#> [1] "eq"
body <- '{
"query": {
"more_like_this": {
"fields": ["abstract","title"],
"like": "cell",
"min_term_freq": 1,
"max_query_terms": 12
}
}
}'
Search(x, 'plos', body=body)$hits$total
#> $value
#> [1] 58
#>
#> $relation
#> [1] "eq"
Highlighting
body <- '{
"query": {
"query_string": {
"query" : "cell"
}
},
"highlight": {
"fields": {
"title": {"number_of_fragments": 2}
}
}
}'
out <- Search(x, 'plos', body=body)
out$hits$total
#> $value
#> [1] 58
#>
#> $relation
#> [1] "eq"
sapply(out$hits$hits, function(x) x$highlight$title[[1]])[8:10]
#> [1] "Functional Analysis of the Drosophila Embryonic Germ <em>Cell</em> Transcriptome by RNA Interference"
#> [2] "Diversin Is Overexpressed in Breast Cancer and Accelerates <em>Cell</em> Proliferation and Invasion"
#> [3] "c-FLIP Protects Eosinophils from TNF-α-Mediated <em>Cell</em> Death In Vivo"
Scrolling search - instead of paging
Search(x, 'shakespeare', q="a*")$hits$total
#> $value
#> [1] 2747
#>
#> $relation
#> [1] "eq"
res <- Search(x, index = 'shakespeare', q="a*", time_scroll = "1m")
length(scroll(x, res$`_scroll_id`, time_scroll = "1m")$hits$hits)
#> [1] 10
res <- Search(x, index = 'shakespeare', q = "a*", time_scroll = "5m")
out <- res$hits$hits
hits <- 1
while (hits != 0) {
res <- scroll(x, res$`_scroll_id`)
hits <- length(res$hits$hits)
if (hits > 0)
out <- c(out, res$hits$hits)
}
length(out)
#> [1] 2747
res$hits$total
#> $value
#> [1] 2747
#>
#> $relation
#> [1] "eq"
Woohoo! Collected all 2747 documents in very little time.