Strip header and footer content from a Project Gutenberg book. This is based on some formatting guesses, so it may not be perfect. It also will not strip tables of contents, prologues, or other text that appears at the start of a book.

gutenberg_strip(text)

Arguments

text

A character vector containing the lines of a book.

Examples


library(dplyr)
book <- gutenberg_works(title == "Pride and Prejudice") %>%
  gutenberg_download(strip = FALSE)
#> Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
#> Using mirror http://aleph.gutenberg.org

head(book$text, 10)
#>  [1] "The Project Gutenberg eBook of Pride and Prejudice, by Jane Austen"      
#>  [2] ""                                                                        
#>  [3] "This eBook is for the use of anyone anywhere in the United States and"   
#>  [4] "most other parts of the world at no cost and with almost no restrictions"
#>  [5] "whatsoever. You may copy it, give it away or re-use it under the terms"  
#>  [6] "of the Project Gutenberg License included with this eBook or online at"  
#>  [7] "www.gutenberg.org. If you are not located in the United States, you"     
#>  [8] "will have to check the laws of the country where you are located before" 
#>  [9] "using this eBook."                                                       
#> [10] ""                                                                        
tail(book$text, 10)
#>  [1] ""                                                                  
#>  [2] "Most people start at our website which has the main PG search"     
#>  [3] "facility: www.gutenberg.org"                                       
#>  [4] ""                                                                  
#>  [5] "This website includes information about Project Gutenberg-tm,"     
#>  [6] "including how to make donations to the Project Gutenberg Literary" 
#>  [7] "Archive Foundation, how to help produce our new eBooks, and how to"
#>  [8] "subscribe to our email newsletter to hear about new eBooks."       
#>  [9] ""                                                                  
#> [10] ""                                                                  

text_stripped <- gutenberg_strip(book$text)

head(text_stripped, 10)
#>  [1] "THERE IS AN ILLUSTRATED EDITION OF THIS TITLE WHICH MAY VIEWED AT EBOOK"
#>  [2] "[# 42671 ]"                                                             
#>  [3] ""                                                                       
#>  [4] "cover"                                                                  
#>  [5] ""                                                                       
#>  [6] ""                                                                       
#>  [7] ""                                                                       
#>  [8] ""                                                                       
#>  [9] "Pride and Prejudice"                                                    
#> [10] ""                                                                       
tail(text_stripped, 10)
#>  [1] "      see how his wife conducted herself; and she condescended to wait"
#>  [2] "      on them at Pemberley, in spite of that pollution which its woods"
#>  [3] "      had received, not merely from the presence of such a mistress,"  
#>  [4] "      but the visits of her uncle and aunt from the city."             
#>  [5] ""                                                                      
#>  [6] "      With the Gardiners, they were always on the most intimate terms."
#>  [7] "      Darcy, as well as Elizabeth, really loved them; and they were"   
#>  [8] "      both ever sensible of the warmest gratitude towards the persons" 
#>  [9] "      who, by bringing her into Derbyshire, had been the means of"     
#> [10] "      uniting them."