Extract tables from a file
extract_tables(file, pages = NULL, area = NULL, columns = NULL, guess = TRUE, method = c("decide", "lattice", "stream"), output = c("matrix", "data.frame", "character", "asis", "csv", "tsv", "json"), outdir = NULL, password = NULL, encoding = NULL, copy = FALSE, ...)
file | A character string specifying the path or URL to a PDF file. |
---|---|
pages | An optional integer vector specifying pages to extract from. |
area | An optional list, of length equal to the number of pages specified, where each entry contains a four-element numeric vector of coordinates (top,left,bottom,right) containing the table for the corresponding page. As a convenience, a list of length 1 can be used to extract the same area from all (specified) pages. Only specify one of `area` or `columns`, not both. |
columns | An optional list, of length equal to the number of pages specified, where each entry contains a numeric vector of horizontal (x) coordinates separating columns of data for the corresponding page. As a convenience, a list of length 1 can be used to specify the same columns for all (specified) pages. Only specify one of `area` or `columns`, not both. |
guess | A logical indicating whether to guess the locations of tables on each page. If `FALSE`, `area` or `columns` must be specified; if `TRUE`, `columns` is ignored. |
method | A string identifying the preferred method of table extraction: one of "decide" (the default), "lattice", or "stream".
|
output | A string specifying how to coerce the Java response object (a Java ArrayList of Tabula Tables) to some output format. The default, "matrix", returns a list of character matrices. See Details for other options. |
outdir | Output directory for files if `output` is set to "csv", "tsv", or "json". |
password | Optionally, a character string containing a user password to access a secured PDF. |
encoding | Optionally, a character string specifying an encoding for the text, to be passed to the assignment method of `Encoding`. |
copy | Specifies whether the original local file(s) should be copied to `tempdir()` before processing. `FALSE` by default.
|
... | These are additional arguments passed to the internal functions dispatched by `method`. |
By default, a list of character matrices. This can be changed by specifying an alternative value of output
(see Details).
This function mimics the behavior of the Tabula command line utility. It returns a list of R character matrices containing tables extracted from a file by default. This response behavior can be changed by using the following options.
output = "character"
returns a list of single-element character vectors, where each vector is a tab-delimited, line-separated string of concatenated table cells.
output = "data.frame"
attempts to coerce the structure returned by output = "character"
into a list of data.frames and returns character strings where this fails.
output = "csv"
writes the tables to comma-separated (CSV) files using Tabula's CSVWriter method in the same directory as the original PDF. output = "tsv"
does the same but with tab-separated (TSV) files using Tabula's TSVWriter and output = "json"
does the same using Tabula's JSONWriter method. Each of these three options returns the path to the directory containing the extracted table files.
output = "asis"
returns the Java object reference, which can be useful for debugging or for writing a custom parser.
extract_areas
implements this functionality in an interactive mode allowing the user to specify extraction areas for each page.
Thomas J. Leeper <[email protected]>, Tom Paskhalis <[email protected]>
# \donttest{ # simple demo file f <- system.file("examples", "data.pdf", package = "tabulizer") # extract all tables extract_tables(f)#> [[1]] #> [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] #> [1,] "mpg" "cyl" "disp" "hp" "drat" "wt" "qsec" "vs" "am" "gear" #> [2,] "21.0" "6" "160.0" "110" "3.90" "2.620" "16.46" "0" "1" "4" #> [3,] "21.0" "6" "160.0" "110" "3.90" "2.875" "17.02" "0" "1" "4" #> [4,] "22.8" "4" "108.0" "93" "3.85" "2.320" "18.61" "1" "1" "4" #> [5,] "21.4" "6" "258.0" "110" "3.08" "3.215" "19.44" "1" "0" "3" #> [6,] "18.7" "8" "360.0" "175" "3.15" "3.440" "17.02" "0" "0" "3" #> [7,] "18.1" "6" "225.0" "105" "2.76" "3.460" "20.22" "1" "0" "3" #> [8,] "14.3" "8" "360.0" "245" "3.21" "3.570" "15.84" "0" "0" "3" #> [9,] "24.4" "4" "146.7" "62" "3.69" "3.190" "20.00" "1" "0" "4" #> [10,] "22.8" "4" "140.8" "95" "3.92" "3.150" "22.90" "1" "0" "4" #> [11,] "19.2" "6" "167.6" "123" "3.92" "3.440" "18.30" "1" "0" "4" #> [12,] "17.8" "6" "167.6" "123" "3.92" "3.440" "18.90" "1" "0" "4" #> [13,] "16.4" "8" "275.8" "180" "3.07" "4.070" "17.40" "0" "0" "3" #> [14,] "17.3" "8" "275.8" "180" "3.07" "3.730" "17.60" "0" "0" "3" #> [15,] "15.2" "8" "275.8" "180" "3.07" "3.780" "18.00" "0" "0" "3" #> [16,] "10.4" "8" "472.0" "205" "2.93" "5.250" "17.98" "0" "0" "3" #> [17,] "10.4" "8" "460.0" "215" "3.00" "5.424" "17.82" "0" "0" "3" #> [18,] "14.7" "8" "440.0" "230" "3.23" "5.345" "17.42" "0" "0" "3" #> [19,] "32.4" "4" "78.7" "66" "4.08" "2.200" "19.47" "1" "1" "4" #> [20,] "30.4" "4" "75.7" "52" "4.93" "1.615" "18.52" "1" "1" "4" #> [21,] "33.9" "4" "71.1" "65" "4.22" "1.835" "19.90" "1" "1" "4" #> [22,] "21.5" "4" "120.1" "97" "3.70" "2.465" "20.01" "1" "0" "3" #> [23,] "15.5" "8" "318.0" "150" "2.76" "3.520" "16.87" "0" "0" "3" #> [24,] "15.2" "8" "304.0" "150" "3.15" "3.435" "17.30" "0" "0" "3" #> [25,] "13.3" "8" "350.0" "245" "3.73" "3.840" "15.41" "0" "0" "3" #> [26,] "19.2" "8" "400.0" "175" "3.08" "3.845" "17.05" "0" "0" "3" #> [27,] "27.3" "4" "79.0" 
"66" "4.08" "1.935" "18.90" "1" "1" "4" #> [28,] "26.0" "4" "120.3" "91" "4.43" "2.140" "16.70" "0" "1" "5" #> [29,] "30.4" "4" "95.1" "113" "3.77" "1.513" "16.90" "1" "1" "5" #> [30,] "15.8" "8" "351.0" "264" "4.22" "3.170" "14.50" "0" "1" "5" #> [31,] "19.7" "6" "145.0" "175" "3.62" "2.770" "15.50" "0" "1" "5" #> [32,] "15.0" "8" "301.0" "335" "3.54" "3.570" "14.60" "0" "1" "5" #> #> [[2]] #> [,1] [,2] [,3] [,4] [,5] #> [1,] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species" #> [2,] "5.1" "3.5" "1.4" "0.2" "setosa" #> [3,] "4.9" "3.0" "1.4" "0.2" "setosa" #> [4,] "4.7" "3.2" "1.3" "0.2" "setosa" #> [5,] "4.6" "3.1" "1.5" "0.2" "setosa" #> [6,] "5.0" "3.6" "1.4" "0.2" "setosa" #> [7,] "5.4" "3.9" "1.7" "0.4" "setosa" #> #> [[3]] #> [,1] [,2] [,3] [,4] [,5] #> [1,] "" "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" #> [2,] "145" "6.7" "3.3" "5.7" "2.5" #> [3,] "146" "6.7" "3.0" "5.2" "2.3" #> [4,] "147" "6.3" "2.5" "5.0" "1.9" #> [5,] "148" "6.5" "3.0" "5.2" "2.0" #> [6,] "149" "6.2" "3.4" "5.4" "2.3" #> [7,] "150" "5.9" "3.0" "5.1" "1.8" #> [,6] #> [1,] "Species" #> [2,] "virginica" #> [3,] "virginica" #> [4,] "virginica" #> [5,] "virginica" #> [6,] "virginica" #> [7,] "virginica" #> #> [[4]] #> [,1] #> [1,] "supp" #> [2,] "VC" #> [3,] "VC" #> [4,] "VC" #> [5,] "VC" #> [6,] "VC" #> [7,] "VC" #> [8,] "VC" #> [9,] "VC" #> [10,] "VC" #> [11,] "VC" #> [12,] "VC" #> [13,] "VC" #> [14,] "VC" #> [15,] "VC" #># extract tables from only second page extract_tables(f, pages = 2)#> [[1]] #> [,1] [,2] [,3] [,4] [,5] #> [1,] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species" #> [2,] "5.1" "3.5" "1.4" "0.2" "setosa" #> [3,] "4.9" "3.0" "1.4" "0.2" "setosa" #> [4,] "4.7" "3.2" "1.3" "0.2" "setosa" #> [5,] "4.6" "3.1" "1.5" "0.2" "setosa" #> [6,] "5.0" "3.6" "1.4" "0.2" "setosa" #> [7,] "5.4" "3.9" "1.7" "0.4" "setosa" #> #> [[2]] #> [,1] [,2] [,3] [,4] [,5] #> [1,] "" "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" #> [2,] 
"145" "6.7" "3.3" "5.7" "2.5" #> [3,] "146" "6.7" "3.0" "5.2" "2.3" #> [4,] "147" "6.3" "2.5" "5.0" "1.9" #> [5,] "148" "6.5" "3.0" "5.2" "2.0" #> [6,] "149" "6.2" "3.4" "5.4" "2.3" #> [7,] "150" "5.9" "3.0" "5.1" "1.8" #> [,6] #> [1,] "Species" #> [2,] "virginica" #> [3,] "virginica" #> [4,] "virginica" #> [5,] "virginica" #> [6,] "virginica" #> [7,] "virginica" #># extract areas from a page ## full table extract_tables(f, pages = 2, area = list(c(126, 149, 212, 462)))#> [[1]] #> [,1] [,2] [,3] [,4] [,5] #> [1,] "" "Sepal.Width" "Petal.Length" "Petal.Width" "Species" #> [2,] "5.1" "3.5" "1.4" "0.2" "setosa" #> [3,] "4.9" "3.0" "1.4" "0.2" "setosa" #> [4,] "4.7" "3.2" "1.3" "0.2" "setosa" #> [5,] "4.6" "3.1" "1.5" "0.2" "setosa" #> [6,] "5.0" "3.6" "1.4" "0.2" "setosa" #> [7,] "3.9" "1.7" "0.4" "setosa" "" #>#> [[1]] #> [,1] [,2] #> [1,] "" "Petal.Width" #> [2,] "1.4" "0.2" #> [3,] "1.4" "0.2" #># return data.frames extract_tables(f, pages = 2, output = "data.frame")#> [[1]] #> Sepal.Length Sepal.Width Petal.Length Petal.Width Species #> 1 5.1 3.5 1.4 0.2 setosa #> 2 4.9 3.0 1.4 0.2 setosa #> 3 4.7 3.2 1.3 0.2 setosa #> 4 4.6 3.1 1.5 0.2 setosa #> 5 5.0 3.6 1.4 0.2 setosa #> 6 5.4 3.9 1.7 0.4 setosa #> #> [[2]] #> X Sepal.Length Sepal.Width Petal.Length Petal.Width Species #> 1 145 6.7 3.3 5.7 2.5 virginica #> 2 146 6.7 3.0 5.2 2.3 virginica #> 3 147 6.3 2.5 5.0 1.9 virginica #> 4 148 6.5 3.0 5.2 2.0 virginica #> 5 149 6.2 3.4 5.4 2.3 virginica #> 6 150 5.9 3.0 5.1 1.8 virginica #># }