Load elastic

The Search function

The main interface for searching documents in your Elasticsearch store is the function Search(). I nearly always name R functions in all lowercase, but R already has a function called search(), and I wanted to avoid a collision with that function.

Search() is an interface to both the HTTP URI search API (in which queries are passed in the URI of the request, meaning queries have to be relatively simple) and the Query DSL, in which queries are passed in the body of a POST request (so they can be much more complex).

There are a huge number of ways you can search Elasticsearch documents - this tutorial covers some of them, and highlights the ways in which you interact with the R outputs.

x <- connect()

Search an index

out <- Search(x, index="shakespeare")
out$hits$total
#> $value
#> [1] 5000
#> 
#> $relation
#> [1] "eq"
out$hits$hits[[1]]
#> $`_index`
#> [1] "shakespeare"
#> 
#> $`_type`
#> [1] "_doc"
#> 
#> $`_id`
#> [1] "0"
#> 
#> $`_score`
#> [1] 1
#> 
#> $`_source`
#> $`_source`$line_id
#> [1] 1
#> 
#> $`_source`$play_name
#> [1] "Henry IV"
#> 
#> $`_source`$line_number
#> [1] ""
#> 
#> $`_source`$speaker
#> [1] ""
#> 
#> $`_source`$text_entry
#> [1] "ACT I"

Search an index by type (mapping types are deprecated in Elasticsearch 7 and later, so only the index is specified here)

Search(x, index = "shakespeare")$hits$hits[[1]]
#> $`_index`
#> [1] "shakespeare"
#> 
#> $`_type`
#> [1] "_doc"
#> 
#> $`_id`
#> [1] "0"
#> 
#> $`_score`
#> [1] 1
#> 
#> $`_source`
#> $`_source`$line_id
#> [1] 1
#> 
#> $`_source`$play_name
#> [1] "Henry IV"
#> 
#> $`_source`$line_number
#> [1] ""
#> 
#> $`_source`$speaker
#> [1] ""
#> 
#> $`_source`$text_entry
#> [1] "ACT I"

Return certain fields

Search(x, index = "shakespeare", body = '{
  "_source": ["play_name", "speaker"]
}')$hits$hits[[1]]
#> $`_index`
#> [1] "shakespeare"
#> 
#> $`_type`
#> [1] "_doc"
#> 
#> $`_id`
#> [1] "0"
#> 
#> $`_score`
#> [1] 1
#> 
#> $`_source`
#> $`_source`$play_name
#> [1] "Henry IV"
#> 
#> $`_source`$speaker
#> [1] ""

Paging

Search(x, index="shakespeare", size=1, from=1)$hits
#> $total
#> $total$value
#> [1] 5000
#> 
#> $total$relation
#> [1] "eq"
#> 
#> 
#> $max_score
#> [1] 1
#> 
#> $hits
#> $hits[[1]]
#> $hits[[1]]$`_index`
#> [1] "shakespeare"
#> 
#> $hits[[1]]$`_type`
#> [1] "_doc"
#> 
#> $hits[[1]]$`_id`
#> [1] "1"
#> 
#> $hits[[1]]$`_score`
#> [1] 1
#> 
#> $hits[[1]]$`_source`
#> $hits[[1]]$`_source`$line_id
#> [1] 2
#> 
#> $hits[[1]]$`_source`$play_name
#> [1] "Henry IV"
#> 
#> $hits[[1]]$`_source`$line_number
#> [1] ""
#> 
#> $hits[[1]]$`_source`$speaker
#> [1] ""
#> 
#> $hits[[1]]$`_source`$text_entry
#> [1] "SCENE I. London. The palace."

Queries

Using the q parameter you can pass in a query, which is sent in the URI of the request. This type of query is less powerful than the queries shown further below, which are passed in the body of the request using the body parameter.

Search(x, index="shakespeare", q="speaker:KING HENRY IV")$hits$total
#> $value
#> [1] 5000
#> 
#> $relation
#> [1] "eq"

More complex queries

Here, we query for values from 10 to 20 (inclusive) in the field line_id.

Search(x, index="shakespeare", q="line_id:[10 TO 20]")$hits$total
#> $value
#> [1] 11
#> 
#> $relation
#> [1] "eq"

Get version number for each document

The version number is not returned by default; set version=TRUE to include it.

sapply(Search(x, index="shakespeare", version=TRUE, size=2)$hits$hits, "[[", "_version")
#> [1] 1 1

Get raw data

Search(x, index="shakespeare", raw=TRUE)
#> [1] "{\"took\":0,\"timed_out\":false,\"_shards\":{\"total\":1,\"successful\":1,\"skipped\":0,\"failed\":0},\"hits\":{\"total\":{\"value\":5000,\"relation\":\"eq\"},\"max_score\":1.0,\"hits\":[{\"_index\":\"shakespeare\",\"_type\":\"_doc\",\"_id\":\"0\",\"_score\":1.0,\"_source\":{\"line_id\":1,\"play_name\":\"Henry IV\",\"line_number\":\"\",\"speaker\":\"\",\"text_entry\":\"ACT I\"}},{\"_index\":\"shakespeare\",\"_type\":\"_doc\",\"_id\":\"1\",\"_score\":1.0,\"_source\":{\"line_id\":2,\"play_name\":\"Henry IV\",\"line_number\":\"\",\"speaker\":\"\",\"text_entry\":\"SCENE I. London. The palace.\"}},{\"_index\":\"shakespeare\",\"_type\":\"_doc\",\"_id\":\"2\",\"_score\":1.0,\"_source\":{\"line_id\":3,\"play_name\":\"Henry IV\",\"line_number\":\"\",\"speaker\":\"\",\"text_entry\":\"Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others\"}},{\"_index\":\"shakespeare\",\"_type\":\"_doc\",\"_id\":\"3\",\"_score\":1.0,\"_source\":{\"line_id\":4,\"play_name\":\"Henry IV\",\"speech_number\":1,\"line_number\":\"1.1.1\",\"speaker\":\"KING HENRY IV\",\"text_entry\":\"So shaken as we are, so wan with care,\"}},{\"_index\":\"shakespeare\",\"_type\":\"_doc\",\"_id\":\"4\",\"_score\":1.0,\"_source\":{\"line_id\":5,\"play_name\":\"Henry IV\",\"speech_number\":1,\"line_number\":\"1.1.2\",\"speaker\":\"KING HENRY IV\",\"text_entry\":\"Find we a time for frighted peace to pant,\"}},{\"_index\":\"shakespeare\",\"_type\":\"_doc\",\"_id\":\"5\",\"_score\":1.0,\"_source\":{\"line_id\":6,\"play_name\":\"Henry IV\",\"speech_number\":1,\"line_number\":\"1.1.3\",\"speaker\":\"KING HENRY IV\",\"text_entry\":\"And breathe short-winded accents of new broils\"}},{\"_index\":\"shakespeare\",\"_type\":\"_doc\",\"_id\":\"6\",\"_score\":1.0,\"_source\":{\"line_id\":7,\"play_name\":\"Henry IV\",\"speech_number\":1,\"line_number\":\"1.1.4\",\"speaker\":\"KING HENRY IV\",\"text_entry\":\"To be commenced in strands afar 
remote.\"}},{\"_index\":\"shakespeare\",\"_type\":\"_doc\",\"_id\":\"7\",\"_score\":1.0,\"_source\":{\"line_id\":8,\"play_name\":\"Henry IV\",\"speech_number\":1,\"line_number\":\"1.1.5\",\"speaker\":\"KING HENRY IV\",\"text_entry\":\"No more the thirsty entrance of this soil\"}},{\"_index\":\"shakespeare\",\"_type\":\"_doc\",\"_id\":\"8\",\"_score\":1.0,\"_source\":{\"line_id\":9,\"play_name\":\"Henry IV\",\"speech_number\":1,\"line_number\":\"1.1.6\",\"speaker\":\"KING HENRY IV\",\"text_entry\":\"Shall daub her lips with her own childrens blood;\"}},{\"_index\":\"shakespeare\",\"_type\":\"_doc\",\"_id\":\"9\",\"_score\":1.0,\"_source\":{\"line_id\":10,\"play_name\":\"Henry IV\",\"speech_number\":1,\"line_number\":\"1.1.7\",\"speaker\":\"KING HENRY IV\",\"text_entry\":\"Nor more shall trenching war channel her fields,\"}}]}}"

Curl debugging

Common options are verbose=TRUE, timeout_ms=1, followlocation=TRUE.

out <- Search(x, index="shakespeare", verbose = TRUE)

Query DSL searches - queries sent in the body of the request

Pass in as an R list

mapping_create(x, "shakespeare", update_all_types = TRUE, body = '{
   "properties": {
     "text_entry": {
       "type":     "text",
       "fielddata": true
    }
  }
}')
#> $acknowledged
#> [1] TRUE
aggs <- list(aggs = list(stats = list(terms = list(field = "text_entry"))))
Search(x, index="shakespeare", body=aggs)$hits$hits[[1]]
#> $`_index`
#> [1] "shakespeare"
#> 
#> $`_type`
#> [1] "_doc"
#> 
#> $`_id`
#> [1] "0"
#> 
#> $`_score`
#> [1] 1
#> 
#> $`_source`
#> $`_source`$line_id
#> [1] 1
#> 
#> $`_source`$play_name
#> [1] "Henry IV"
#> 
#> $`_source`$line_number
#> [1] ""
#> 
#> $`_source`$speaker
#> [1] ""
#> 
#> $`_source`$text_entry
#> [1] "ACT I"

Or pass in as a JSON query string with newlines, which is easy to read

aggs <- '{
    "aggs": {
        "stats" : {
            "terms" : {
                "field" : "text_entry"
            }
        }
    }
}'
Search(x, index="shakespeare", body=aggs)$hits$hits[[1]]
#> $`_index`
#> [1] "shakespeare"
#> 
#> $`_type`
#> [1] "_doc"
#> 
#> $`_id`
#> [1] "0"
#> 
#> $`_score`
#> [1] 1
#> 
#> $`_source`
#> $`_source`$line_id
#> [1] 1
#> 
#> $`_source`$play_name
#> [1] "Henry IV"
#> 
#> $`_source`$line_number
#> [1] ""
#> 
#> $`_source`$speaker
#> [1] ""
#> 
#> $`_source`$text_entry
#> [1] "ACT I"

Or pass in as a collapsed JSON string

aggs <- '{"aggs":{"stats":{"terms":{"field":"text_entry"}}}}'
Search(x, index="shakespeare", body=aggs)$hits$hits[[1]]
#> $`_index`
#> [1] "shakespeare"
#> 
#> $`_type`
#> [1] "_doc"
#> 
#> $`_id`
#> [1] "0"
#> 
#> $`_score`
#> [1] 1
#> 
#> $`_source`
#> $`_source`$line_id
#> [1] 1
#> 
#> $`_source`$play_name
#> [1] "Henry IV"
#> 
#> $`_source`$line_number
#> [1] ""
#> 
#> $`_source`$speaker
#> [1] ""
#> 
#> $`_source`$text_entry
#> [1] "ACT I"

Aggregations

Histograms

aggs <- '{
    "aggs": {
        "latbuckets" : {
           "histogram" : {
               "field" : "decimalLatitude",
               "interval" : 5
           }
        }
    }
}'
Search(x, index="gbif", body=aggs, size=0)$aggregations$latbuckets$buckets[1:3]
#> [[1]]
#> [[1]]$key
#> [1] -35
#> 
#> [[1]]$doc_count
#> [1] 1
#> 
#> 
#> [[2]]
#> [[2]]$key
#> [1] -30
#> 
#> [[2]]$doc_count
#> [1] 0
#> 
#> 
#> [[3]]
#> [[3]]$key
#> [1] -25
#> 
#> [[3]]$doc_count
#> [1] 0

A bool query

mmatch <- '{
 "query": {
   "bool" : {
     "must_not" : {
       "range" : {
         "speech_number" : {
           "from" : 1, "to": 5
}}}}}}'
sapply(Search(x, index="shakespeare", body=mmatch)$hits$hits, function(x) x$`_source`$speech_number)
#> [[1]]
#> NULL
#> 
#> [[2]]
#> NULL
#> 
#> [[3]]
#> NULL
#> 
#> [[4]]
#> [1] 6
#> 
#> [[5]]
#> [1] 6
#> 
#> [[6]]
#> [1] 7
#> 
#> [[7]]
#> [1] 7
#> 
#> [[8]]
#> [1] 7
#> 
#> [[9]]
#> [1] 7
#> 
#> [[10]]
#> [1] 7

Fuzzy query

Fuzzy query on a text field

fuzzy <- list(query = list(fuzzy = list(text_entry = "arms")))
Search(x, index="shakespeare", body = fuzzy)$hits$total
#> $value
#> [1] 49
#> 
#> $relation
#> [1] "eq"
fuzzy <- list(query = list(fuzzy = list(text_entry = list(value = "arms", fuzziness = 4))))
Search(x, index="shakespeare", body=fuzzy)$hits$total
#> $value
#> [1] 618
#> 
#> $relation
#> [1] "eq"

Range query

With numeric

body <- list(query=list(range=list(decimalLongitude=list(gte=1, lte=3))))
Search(x, 'gbif', body=body)$hits$total
#> $value
#> [1] 24
#> 
#> $relation
#> [1] "eq"
body <- list(query=list(range=list(decimalLongitude=list(gte=2.9, lte=10))))
Search(x, 'gbif', body=body)$hits$total
#> $value
#> [1] 126
#> 
#> $relation
#> [1] "eq"

With dates

body <- list(query=list(range=list(eventDate=list(gte="2012-01-01", lte="now"))))
Search(x, 'gbif', body=body)$hits$total
#> $value
#> [1] 301
#> 
#> $relation
#> [1] "eq"
body <- list(query=list(range=list(eventDate=list(gte="2014-01-01", lte="now"))))
Search(x, 'gbif', body=body)$hits$total
#> $value
#> [1] 292
#> 
#> $relation
#> [1] "eq"

More-like-this query (older Elasticsearch versions also accepted the shorthand mlt for more_like_this; recent versions require the full name)

body <- '{
 "query": {
   "more_like_this": {
     "fields": ["abstract","title"],
     "like": "and then",
     "min_term_freq": 1,
     "max_query_terms": 12
   }
 }
}'
Search(x, 'plos', body=body)$hits$total
#> $value
#> [1] 488
#> 
#> $relation
#> [1] "eq"
body <- '{
 "query": {
   "more_like_this": {
     "fields": ["abstract","title"],
     "like": "cell",
     "min_term_freq": 1,
     "max_query_terms": 12
   }
 }
}'
Search(x, 'plos', body=body)$hits$total
#> $value
#> [1] 58
#> 
#> $relation
#> [1] "eq"

Highlighting

body <- '{
 "query": {
   "query_string": {
     "query" : "cell"
   }
 },
 "highlight": {
   "fields": {
     "title": {"number_of_fragments": 2}
   }
 }
}'
out <- Search(x, 'plos', body=body)
out$hits$total
#> $value
#> [1] 58
#> 
#> $relation
#> [1] "eq"
sapply(out$hits$hits, function(x) x$highlight$title[[1]])[8:10]
#> [1] "Functional Analysis of the Drosophila Embryonic Germ <em>Cell</em> Transcriptome by RNA Interference"
#> [2] "Diversin Is Overexpressed in Breast Cancer and Accelerates <em>Cell</em> Proliferation and Invasion" 
#> [3] "c-FLIP Protects Eosinophils from TNF-α-Mediated <em>Cell</em> Death In Vivo"

Scrolling search - instead of paging

Search(x, 'shakespeare', q="a*")$hits$total
#> $value
#> [1] 2747
#> 
#> $relation
#> [1] "eq"
res <- Search(x, index = 'shakespeare', q="a*", time_scroll = "1m")
length(scroll(x, res$`_scroll_id`, time_scroll = "1m")$hits$hits)
#> [1] 10
res <- Search(x, index = 'shakespeare', q = "a*", time_scroll = "5m")
out <- res$hits$hits
hits <- 1
while (hits != 0) {
  res <- scroll(x, res$`_scroll_id`)
  hits <- length(res$hits$hits)
  if (hits > 0)
    out <- c(out, res$hits$hits)
}
length(out)
#> [1] 2747
res$hits$total
#> $value
#> [1] 2747
#> 
#> $relation
#> [1] "eq"

Woohoo! Collected all 2747 documents in very little time.