rcrossref introduction

Installation

Install stable version from CRAN

install.packages("rcrossref")

Or development version from GitHub

remotes::install_github("ropensci/rcrossref")

library("rcrossref")

Citation search

Use CrossRef’s DOI Content Negotiation service to get citations back in various formats, including apa

cr_cn(dois = "10.1371/journal.pone.0112608", format = "text", style = "apa")
#> [1] "Wang, Q., & Taylor, J. E. (2014). Quantifying Human Mobility Perturbation and Resilience in Hurricane Sandy. PLoS ONE, 9(11), e112608. doi:10.1371/journal.pone.0112608"

There are a lot more styles. We include a dataset as a character vector within the package, accessible via the get_styles() function, e.g.,

get_styles()[1:5]
#> [1] "academy-of-management-review"        
#> [2] "accident-analysis-and-prevention"    
#> [3] "aci-materials-journal"               
#> [4] "acm-sig-proceedings-long-author-list"
#> [5] "acm-sig-proceedings"

bibtex

cat(cr_cn(dois = "10.1126/science.169.3946.635", format = "bibtex"))
#> @article{Frank_1970,
#>  doi = {10.1126/science.169.3946.635},
#>  url = {https://doi.org/10.1126%2Fscience.169.3946.635},
#>  year = 1970,
#>  month = {aug},
#>  publisher = {American Association for the Advancement of Science ({AAAS})},
#>  volume = {169},
#>  number = {3946},
#>  pages = {635--641},
#>  author = {H. S. Frank},
#>  title = {The Structure of Ordinary Water: New data and interpretations are yielding new insights into this fascinating substance},
#>  journal = {Science}
#> }

bibentry

cr_cn(dois = "10.6084/m9.figshare.97218", format = "bibentry")
#> $doi
#> [1] "10.6084/M9.FIGSHARE.97218"
#> 
#> $url
#> [1] "https://figshare.com/articles/thesis/Regime_shifts_in_ecology_and_evolution_(PhD_Dissertation)/97218"
#> 
#> $author
#> [1] "Boettiger, Carl"
#> 
#> $keywords
#> [1] "Evolutionary Biology, FOS: Biological sciences, FOS: Biological sciences, Ecology"
#> 
#> $title
#> [1] "Regime shifts in ecology and evolution (PhD Dissertation)"
#> 
#> $publisher
#> [1] "figshare"
#> 
#> $year
#> [1] "2012"
#> 
#> $copyright
#> [1] "Creative Commons Attribution 4.0 International"
#> 
#> $key
#> [1] "https://doi.org/10.6084/m9.figshare.97218"
#> 
#> $entry
#> [1] "article"

Citation count

Citation count, using OpenURL

cr_citation_count(doi="10.1371/journal.pone.0042793")
#>                            doi count
#> 1 10.1371/journal.pone.0042793    43

Search Crossref metadata API

The following functions all use the CrossRef API.

Look up funder information

cr_funders(query="NSF")
#> $meta
#>   total_results search_terms start_index items_per_page
#> 1            39          NSF           0             20
#> 
#> $data
#> # A tibble: 20 × 6
#>    id           location       name         alt.names       uri     tokens      
#>    <chr>        <chr>          <chr>        <chr>           <chr>   <chr>       
#>  1 100000001    United States  National Sc… "USA NSF, NSF,… http:/… national, s…
#>  2 100015388    United States  Kansas NSF … "KNE, NSF EPSC… http:/… kansas, nsf…
#>  3 100016323    United States  Arkansas NS… "Arkansas EPSC… http:/… arkansas, n…
#>  4 100003187    United States  National Sl… "NSF"           http:/… national, s…
#>  5 501100000930 Australia      National St… "NSF"           http:/… national, s…
#>  6 501100004190 Norway         Norsk Sykep… "NSF, Norwegia… http:/… norsk, syke…
#>  7 501100020414 United Kingdom Neuroscienc… "The Neuroscie… http:/… neuroscienc…
#>  8 100000154    United States  Division of… "IOS, NSF Divi… http:/… division, o…
#>  9 100017338    China          Key Program… ""              http:/… key, progra…
#> 10 100000084    United States  Directorate… "NSF Directora… http:/… directorate…
#> 11 100016620    United States  Nick Simons… "NSF, The Nick… http:/… nick, simon…
#> 12 100006445    United States  Center for … "CHM, NSF, Uni… http:/… center, for…
#> 13 100017325    United States  Engineering… "ERC, The NSF … http:/… engineering…
#> 14 100008367    Denmark        Statens Nat… "Danish Nation… http:/… statens, na…
#> 15 100000179    United States  Office of t… "NSF Office of… http:/… office, of,…
#> 16 501100019492 China          National Na… "NSFC-General … http:/… national, n…
#> 17 501100011002 China          National Na… "NSFC-Yunnan J… http:/… national, n…
#> 18 501100008982 Sri Lanka      National Sc… "National Scie… http:/… national, s…
#> 19 501100001809 China          National Na… "NNSF of China… http:/… national, n…
#> 20 501100014220 China          National Na… "NSFC-Henan Jo… http:/… national, n…
#> 
#> $facets
#> NULL

Check the DOI minting agency

cr_agency(dois = '10.13039/100000001')
#> $DOI
#> [1] "10.13039/100000001"
#> 
#> $agency
#> $agency$id
#> [1] "crossref"
#> 
#> $agency$label
#> [1] "Crossref"

Search works (i.e., articles, books, etc.)

cr_works(filter=c(has_orcid=TRUE, from_pub_date='2004-04-04'), limit=1)
#> $meta
#>   total_results search_terms start_index items_per_page
#> 1       6556799           NA           0              1
#> 
#> $data
#> # A tibble: 1 × 26
#>   container.title created  deposited  published.online doi   indexed issn  issue
#>   <chr>           <chr>    <chr>      <chr>            <chr> <chr>   <chr> <chr>
#> 1 Belügyi Szemle  2021-04… 2021-04-26 2021-04-26       10.3… 2021-0… 2677… 4    
#> # … with 18 more variables: issued <chr>, member <chr>, page <chr>,
#> #   prefix <chr>, publisher <chr>, score <chr>, source <chr>,
#> #   reference.count <chr>, references.count <chr>,
#> #   is.referenced.by.count <chr>, title <chr>, type <chr>, url <chr>,
#> #   volume <chr>, abstract <chr>, short.container.title <chr>, author <list>,
#> #   link <list>
#> 
#> $facets
#> NULL

Search journals

cr_journals(issn=c('1803-2427','2326-4225'))
#> $data
#> # A tibble: 2 × 53
#>   title    publisher   issn   last_status_che… deposits_abstra… deposits_orcids…
#>   <chr>    <chr>       <chr>  <date>           <lgl>            <lgl>           
#> 1 Journal… De Gruyter… 1803-… 2021-08-18       TRUE             TRUE            
#> 2 Journal… American S… 2326-… 2021-08-18       FALSE            FALSE           
#> # … with 47 more variables: deposits <lgl>,
#> #   deposits_affiliations_backfile <lgl>,
#> #   deposits_update_policies_backfile <lgl>,
#> #   deposits_similarity_checking_backfile <lgl>,
#> #   deposits_award_numbers_current <lgl>,
#> #   deposits_resource_links_current <lgl>, deposits_articles <lgl>,
#> #   deposits_affiliations_current <lgl>, deposits_funders_current <lgl>, …
#> 
#> $facets
#> NULL

Search license information

cr_licenses(query = 'elsevier')
#> $meta
#>   total_results search_terms start_index items_per_page
#> 1            52           NA          NA             NA
#> 
#> $data
#> # A tibble: 20 × 2
#>    URL                                                                    work.count
#>    <chr>                                                                       <int>
#>  1 http://aspb.org/publications/aspb-journals/open-articles                        1
#>  2 http://creativecommons.org/licenses/by-nc-nd/3.0                                3
#>  3 http://creativecommons.org/licenses/by-nc-nd/3.0/                              11
#>  4 http://creativecommons.org/licenses/by-nc-nd/4.0/                              21
#>  5 http://creativecommons.org/licenses/by-nc/4.0/                                  6
#>  6 http://creativecommons.org/licenses/by-sa/4.0                                   1
#>  7 http://creativecommons.org/licenses/by/2.0                                      2
#>  8 http://creativecommons.org/licenses/by/3.0                                      2
#>  9 http://creativecommons.org/licenses/by/3.0/                                     2
#> 10 http://creativecommons.org/licenses/by/3.0/igo/                                 1
#> 11 http://creativecommons.org/licenses/by/4.0                                     11
#> 12 http://creativecommons.org/licenses/by/4.0/                                    22
#> 13 http://doi.wiley.com/10.1002/tdm_license_1                                    136
#> 14 http://doi.wiley.com/10.1002/tdm_license_1.1                                 2255
#> 15 http://iopscience.iop.org/info/page/text-and-data-mining                        2
#> 16 http://iopscience.iop.org/page/copyright                                        2
#> 17 http://journals.iucr.org/services/copyrightpolicy.html                         11
#> 18 http://journals.iucr.org/services/copyrightpolicy.html#TDM                     11
#> 19 http://journals.sagepub.com/page/policies/text-and-data-mining-license        365
#> 20 http://onlinelibrary.wiley.com/termsAndConditions                              63

Search based on DOI prefixes

cr_prefixes(prefixes=c('10.1016','10.1371','10.1023','10.4176','10.1093'))
#> $meta
#> NULL
#> 
#> $data
#>                               member                                    name
#> 1   http://id.crossref.org/member/78                             Elsevier BV
#> 2  http://id.crossref.org/member/340        Public Library of Science (PLoS)
#> 3  http://id.crossref.org/member/297 Springer Science and Business Media LLC
#> 4 http://id.crossref.org/member/1989                    Co-Action Publishing
#> 5  http://id.crossref.org/member/286           Oxford University Press (OUP)
#>                                  prefix
#> 1 http://id.crossref.org/prefix/10.1016
#> 2 http://id.crossref.org/prefix/10.1371
#> 3 http://id.crossref.org/prefix/10.1023
#> 4 http://id.crossref.org/prefix/10.4176
#> 5 http://id.crossref.org/prefix/10.1093
#> 
#> $facets
#> list()

Search CrossRef members

cr_members(query='ecology', limit = 5)
#> $meta
#>   total_results search_terms start_index items_per_page
#> 1            24      ecology           0              5
#> 
#> $data
#> # A tibble: 5 × 56
#>      id primary_name    location     last_status_che… current.dois backfile.dois
#>   <int> <chr>           <chr>        <date>           <chr>        <chr>        
#> 1  4302 Immediate Scie… Toronto, ON… 2021-08-18       0            6            
#> 2  6933 Knowledge Ecol… Washington,… 2021-08-18       0            1            
#> 3  1950 Journal of Vec… United Stat… 2021-08-18       0            0            
#> 4  2899 Association fo… Eugene, OR,… 2021-08-18       0            0            
#> 5  7745 Institute of A… Makhachkala… 2021-08-18       172          678          
#> # … with 50 more variables: total.dois <chr>, prefixes <chr>,
#> #   coverge.affiliations.current <chr>,
#> #   coverge.similarity.checking.current <chr>, coverge.funders.backfile <chr>,
#> #   coverge.licenses.backfile <chr>, coverge.funders.current <chr>,
#> #   coverge.affiliations.backfile <chr>, coverge.resource.links.backfile <chr>,
#> #   coverge.orcids.backfile <chr>, coverge.update.policies.current <chr>,
#> #   coverge.open.references.backfile <chr>, coverge.orcids.current <chr>, …
#> 
#> $facets
#> NULL

Get N random DOIs

cr_r() uses the function cr_works() internally.

cr_r()
#>  [1] "10.1111/biot.1990.4.issue-2"           
#>  [2] "10.7717/peerj.5714/table-5"            
#>  [3] "10.1061/(asce)0733-947x(2008)134:1(34)"
#>  [4] "10.1016/j.clml.2015.04.108"            
#>  [5] "10.1371/journal.pntd.0009063.s016"     
#>  [6] "10.31616/asj.2020.0425"                
#>  [7] "10.15446/revfacmed.v65n3.61313"        
#>  [8] "10.1586/ecp.09.36"                     
#>  [9] "10.1007/978-3-8348-9174-7_1"           
#> [10] "10.1080/09511920601160171"

You can pass in the number of DOIs you want back (default is 10)

cr_r(2)
#> [1] "10.5194/gmd-2021-94-supplement" "10.1108/prt.2009.12938bad.009"

Get full text

Publishers can optionally provide links in the metadata they provide to Crossref for full text of the work, but that data is often missing. Find out more about it at https://support.crossref.org/hc/en-us/articles/215750183-Crossref-Text-and-Data-Mining-Services

Get some DOIs for articles that provide full text, and that have CC-BY 3.0 licenses (i.e., more likely to actually be open)

out <-
  cr_works(filter = list(has_full_text = TRUE,
    license_url = "http://creativecommons.org/licenses/by/3.0/"))
(dois <- out$data$doi)
#>  [1] "10.1155/jamsa/2006/42542"        "10.1016/s0370-2693(02)01651-9"  
#>  [3] "10.1016/s0370-2693(02)01624-6"   "10.1155/ijmms/2006/89545"       
#>  [5] "10.1016/s0370-2693(01)01058-9"   "10.1016/s0370-2693(01)01257-6"  
#>  [7] "10.1016/s0370-2693(01)01287-4"   "10.1016/s0370-2693(01)01385-5"  
#>  [9] "10.1002/cfg.80"                  "10.1002/cfg.118"                
#> [11] "10.1002/cfg.166"                 "10.1002/cfg.59"                 
#> [13] "10.1002/cfg.108"                 "10.1088/1742-6596/1147/1/012044"
#> [15] "10.1088/1742-6596/1147/1/012077" "10.1088/1742-6596/578/1/012006" 
#> [17] "10.1088/1755-1315/222/1/012020"  "10.1002/ecs2.2575"              
#> [19] "10.1088/1755-1315/214/1/012004"  "10.1088/1755-1315/214/1/012021"

From the output of cr_works we can get full text links if we know where to look:

do.call("rbind", out$data$link)
#> # A tibble: 55 × 4
#>    URL                           content.type  content.version intended.applica…
#>    <chr>                         <chr>         <chr>           <chr>            
#>  1 http://downloads.hindawi.com… application/… vor             text-mining      
#>  2 http://downloads.hindawi.com… unspecified   vor             similarity-check…
#>  3 https://api.elsevier.com/con… text/xml      vor             text-mining      
#>  4 https://api.elsevier.com/con… text/plain    vor             text-mining      
#>  5 https://api.elsevier.com/con… text/xml      vor             text-mining      
#>  6 https://api.elsevier.com/con… text/plain    vor             text-mining      
#>  7 http://downloads.hindawi.com… application/… vor             text-mining      
#>  8 http://downloads.hindawi.com… unspecified   vor             similarity-check…
#>  9 https://api.elsevier.com/con… text/xml      vor             text-mining      
#> 10 https://api.elsevier.com/con… text/plain    vor             text-mining      
#> # … with 45 more rows

From there, you can grab the full text. However, because most links require authentication, this is best done with another package: crminer.

You’ll need package crminer for the rest of the work.

Once we have DOIs, get URLs to full text content:

if (!requireNamespace("crminer")) {
  install.packages("crminer")
}

library(crminer)
#> Error in library(crminer): there is no package called 'crminer'
(links <- crm_links("10.1155/2014/128505"))
#> Error in crm_links("10.1155/2014/128505"): could not find function "crm_links"

Then use those URLs to get full text

crm_pdf(links)
#> <document>/Users/sckott/Library/Caches/R/crminer/128505.pdf
#>   Pages: 1
#>   No. characters: 1565
#>   Created: 2014-09-15

See also fulltext (https://github.com/ropensci/fulltext) for getting scholarly text for text mining.

Scott Chamberlain

2021-08-19