Coordinate based cleaning

coord_incomplete(x, lat = NULL, lon = NULL, drop = TRUE)

coord_imprecise(x, which = "both", lat = NULL, lon = NULL, drop = TRUE)

coord_impossible(x, lat = NULL, lon = NULL, drop = TRUE)

coord_unlikely(x, lat = NULL, lon = NULL, drop = TRUE)

coord_within(
  x,
  field = NULL,
  country = NULL,
  lat = NULL,
  lon = NULL,
  drop = TRUE
)

coord_pol_centroids(x, lat = NULL, lon = NULL, drop = TRUE)

coord_uncertain(
  x,
  coorduncertainityLimit = 30000,
  drop = TRUE,
  ignore.na = FALSE
)

Arguments

x

(data.frame) A data.frame

lat, lon

(character) Latitude and longitude columns to use. See Details.

drop

(logical) Drop bad data points or not. Either way, we parse out bad data points as an attribute you can access. Default: TRUE

which

(character) one of "has_dec", "no_zeros", or "both" (default)

field

(character) Name of field in input data.frame x with country names

country

(character) A single country name

coorduncertainityLimit

(numeric) Threshold for the coordinateUncertaintyInMeters variable. Default: 30000

ignore.na

(logical) To consider NA values as a bad point or not. Default: FALSE

Value

Returns a data.frame, with attributes

Details

Explanation of the functions:

  • coord_impossible - Impossible coordinates

  • coord_incomplete - Incomplete coordinates

  • coord_imprecise - Imprecise coordinates

  • coord_pol_centroids - Points at political centroids

  • coord_unlikely - Unlikely coordinates

  • coord_within - Filter points within user input political boundaries

  • coord_uncertain - Uncertain occurrences, as measured through coordinateUncertaintyInMeters (default limit = 30000)

If either lat or lon (or both) is given, we rename the given column(s) to the standardized names "latitude" and "longitude". If not given, we attempt to guess what the lat and lon column names are and assign the same standardized names. Using standardized names makes downstream processing easier because we're dealing with consistent column names. Before returning the data, we restore the original column names.

For coord_within, we use the countriesLow dataset from the rworldmap package to get country borders.

coord_pol_centroids

Right now, this function only deals with city centroids, using the maps::world.cities dataset of more than 40,000 cities. We'll work on adding country centroids, and perhaps others (e.g., counties, states, provinces, parks, etc.).

Examples

df <- sample_data_1 # Remove impossible coordinates NROW(df)
#> [1] 1500
df[1, "latitude"] <- 170 df <- dframe(df) %>% coord_impossible()
#> Assuming 'latitude' is latitude
#> Assuming 'longitude' is longitude
NROW(df)
#> [1] 1499
attr(df, "coord_impossible")
#> # A tibble: 1 x 5 #> name longitude latitude date key #> <chr> <dbl> <dbl> <dttm> <int> #> 1 Ursus americanus -79.7 170 2015-01-14 16:36:45 1065590124
# Remove incomplete cases NROW(df)
#> [1] 1499
df_inc <- dframe(df) %>% coord_incomplete() NROW(df_inc)
#> [1] 1305
attr(df_inc, "coord_incomplete")
#> # A tibble: 194 x 5 #> name longitude latitude date key #> <chr> <dbl> <dbl> <dttm> <int> #> 1 <NA> NA NA NA NA #> 2 <NA> NA NA NA NA #> 3 <NA> NA NA NA NA #> 4 <NA> NA NA NA NA #> 5 <NA> NA NA NA NA #> 6 <NA> NA NA NA NA #> 7 <NA> NA NA NA NA #> 8 <NA> NA NA NA NA #> 9 <NA> NA NA NA NA #> 10 <NA> NA NA NA NA #> # … with 184 more rows
# Remove imprecise cases df <- sample_data_5 NROW(df)
#> [1] 39
## remove records that don't have decimals at all df_imp <- dframe(df) %>% coord_imprecise(which = "has_dec")
#> Assuming 'latitude' is latitude
#> Assuming 'longitude' is longitude
NROW(df_imp)
#> [1] 33
attr(df_imp, "coord_imprecise")
#> # A tibble: 6 x 5 #> name longitude latitude date key #> <chr> <chr> <chr> <chr> <int> #> 1 Ursus americanus -123.829 40 2015-03-28T23:00:00Z 1132403409 #> 2 Ursus americanus -103.32315 29 2015-05-23T20:11:29Z 1092907160 #> 3 Ursus americanus -110 44.59889 2015-05-02T22:00:00Z 1092897387 #> 4 Ursus americanus -109 31 2015-05-29T15:02:01Z 1098900981 #> 5 Ursus americanus -92.29988 49 2015-05-16T16:27:20Z 1132401463 #> 6 Ursus americanus -110.0 44 2015-05-10T18:05:40Z 1088979156
## remove records that have all zeros df_imp <- dframe(df) %>% coord_imprecise(which = "no_zeros")
#> Assuming 'latitude' is latitude
#> Assuming 'longitude' is longitude
NROW(df_imp)
#> [1] 28
attr(df_imp, "coord_imprecise")
#> # A tibble: 11 x 5 #> name longitude latitude date key #> <chr> <chr> <chr> <chr> <int> #> 1 Ursus americanus -78.25027 36.0000 2015-03-20T21:11:24Z 1088923534 #> 2 Ursus americanus -72.18191 44.0000 2015-04-16T16:16:30Z 1088950245 #> 3 Ursus americanus -100.0000 25.6065 2015-04-06T22:00:00Z 1099969743 #> 4 Ursus americanus -115.0000 51.22819 2015-05-14T01:15:34Z 1092879993 #> 5 Ursus americanus -110.0000 31.87244 2015-05-08T16:49:41Z 1099958687 #> 6 Ursus americanus -72.0000 43.80838 2015-05-27T11:09:14Z 1092905944 #> 7 Ursus americanus -102.00 28.57147 2015-05-05T22:00:00Z 1088972152 #> 8 Ursus americanus -103.0 29.26837 2015-05-23T20:43:13Z 1092907155 #> 9 Ursus americanus -81.0 46.03828 2015-05-29T21:23:21Z 1099959710 #> 10 Ursus americanus -81.30308 46.0 2015-05-21T21:21:25Z 1092890386 #> 11 Ursus americanus -110.0 44 2015-05-10T18:05:40Z 1088979156
## remove both records that don't have decimals at all and those that ## have all zeros df_imp <- dframe(df) %>% coord_imprecise(which = "both")
#> Assuming 'latitude' is latitude
#> Assuming 'longitude' is longitude
NROW(df_imp)
#> [1] 23
attr(df_imp, "coord_imprecise")
#> # A tibble: 16 x 5 #> name longitude latitude date key #> <chr> <chr> <chr> <chr> <int> #> 1 Ursus americanus -123.829 40 2015-03-28T23:00:00Z 1132403409 #> 2 Ursus americanus -103.32315 29 2015-05-23T20:11:29Z 1092907160 #> 3 Ursus americanus -110 44.59889 2015-05-02T22:00:00Z 1092897387 #> 4 Ursus americanus -109 31 2015-05-29T15:02:01Z 1098900981 #> 5 Ursus americanus -92.29988 49 2015-05-16T16:27:20Z 1132401463 #> 6 Ursus americanus -110.0 44 2015-05-10T18:05:40Z 1088979156 #> 7 Ursus americanus -78.25027 36.0000 2015-03-20T21:11:24Z 1088923534 #> 8 Ursus americanus -72.18191 44.0000 2015-04-16T16:16:30Z 1088950245 #> 9 Ursus americanus -100.0000 25.6065 2015-04-06T22:00:00Z 1099969743 #> 10 Ursus americanus -115.0000 51.22819 2015-05-14T01:15:34Z 1092879993 #> 11 Ursus americanus -110.0000 31.87244 2015-05-08T16:49:41Z 1099958687 #> 12 Ursus americanus -72.0000 43.80838 2015-05-27T11:09:14Z 1092905944 #> 13 Ursus americanus -102.00 28.57147 2015-05-05T22:00:00Z 1088972152 #> 14 Ursus americanus -103.0 29.26837 2015-05-23T20:43:13Z 1092907155 #> 15 Ursus americanus -81.0 46.03828 2015-05-29T21:23:21Z 1099959710 #> 16 Ursus americanus -81.30308 46.0 2015-05-21T21:21:25Z 1092890386
# Remove unlikely points NROW(df)
#> [1] 39
df_unlikely <- dframe(df) %>% coord_unlikely()
#> Assuming 'latitude' is latitude
#> Assuming 'longitude' is longitude
NROW(df_unlikely)
#> [1] 39
attr(df_unlikely, "coord_unlikely")
#> [1] NA
# Remove points not within correct political borders if (requireNamespace("rgbif", quietly = TRUE) && interactive()) { library("rgbif") wkt <- 'POLYGON((30.1 10.1,40 40,20 40,10 20,30.1 10.1))' res <- rgbif::occ_data(geometry = wkt, limit=300)$data } else { res <- sample_data_4 }
#> Warning: replacing previous import ‘vctrs::data_frame’ by ‘tibble::data_frame’ when loading ‘dplyr’
## By specific country name NROW(res)
#> [1] 100
df_within <- dframe(res) %>% coord_within(country = "Israel")
#> Assuming 'decimalLatitude' is latitude
#> Assuming 'decimalLongitude' is longitude
#> although coordinates are longitude/latitude, st_within assumes that they are planar
NROW(df_within)
#> [1] 97
attr(df_within, "coord_within")
#> # A tibble: 3 x 6 #> name latitude longitude key eventDate country #> <chr> <dbl> <dbl> <int> <chr> <chr> #> 1 Dasyatis pastinaca 27.3 33.8 1.25e9 2016-01-17T12:51:00.… Egypt #> 2 Dasyatis pastinaca 27.3 33.8 1.25e9 2016-01-17T09:37:00.… Egypt #> 3 Streptopelia seneg… 29.5 35.1 1.25e9 2016-02-06T23:00:00.… Jordan
## By a field in your data - makes sure your points occur in one ## of those countries NROW(res)
#> [1] 100
df_within <- dframe(res) %>% coord_within(field = "country")
#> Assuming 'decimalLatitude' is latitude
#> Assuming 'decimalLongitude' is longitude
#> although coordinates are longitude/latitude, st_within assumes that they are planar
NROW(df_within)
#> [1] 98
head(df_within)
#> # A tibble: 6 x 6 #> name decimalLatitude decimalLongitude key eventDate country #> <chr> <dbl> <dbl> <int> <chr> <chr> #> 1 Drimia mari… 32.9 35.7 1.25e9 2016-01-29T07:1… Israel #> 2 Cyclamen pe… 32.8 35.7 1.23e9 2016-01-22T10:4… Israel #> 3 Mandragora … 32.8 35.7 1.23e9 2016-01-22T10:2… Israel #> 4 Spalax ehre… 32.9 35.6 1.25e9 2016-01-29T11:1… Israel #> 5 Erinaceus c… 32.8 35.7 1.25e9 2016-01-29T09:2… Israel #> 6 Procavia ca… 33.1 35.6 1.23e9 2016-01-04T14:0… Israel
attr(df_within, "coord_within")
#> # A tibble: 2 x 6 #> name latitude longitude key eventDate country #> <chr> <dbl> <dbl> <int> <chr> <chr> #> 1 Dasyatis pastin… 27.3 33.8 1.25e9 2016-01-17T12:51:00.000… Egypt #> 2 Dasyatis pastin… 27.3 33.8 1.25e9 2016-01-17T09:37:00.000… Egypt
# Remove those very near political centroids ## not ready yet # NROW(df) # df_polcent <- dframe(df) %>% coord_pol_centroids() # NROW(df_polcent) # attr(df_polcent, "coord_polcent") ## lat/long column names can vary df <- sample_data_1 head(df)
#> name longitude latitude date key #> 1 Ursus americanus -79.68283 38.36662 2015-01-14 16:36:45 1065590124 #> 2 Ursus americanus -82.42028 35.73304 2015-01-13 00:25:39 1065588899 #> 3 Ursus americanus -99.09625 23.66893 2015-02-20 23:00:00 1098894889 #> 4 Ursus americanus -72.77432 43.94883 2015-02-13 16:16:41 1065611122 #> 5 Ursus americanus -72.34617 43.86464 2015-03-01 20:20:45 1088908315 #> 6 Ursus americanus -108.53674 32.65219 2015-03-29 17:06:54 1088932238
names(df)[2:3] <- c('mylon', 'mylat') head(df)
#> name mylon mylat date key #> 1 Ursus americanus -79.68283 38.36662 2015-01-14 16:36:45 1065590124 #> 2 Ursus americanus -82.42028 35.73304 2015-01-13 00:25:39 1065588899 #> 3 Ursus americanus -99.09625 23.66893 2015-02-20 23:00:00 1098894889 #> 4 Ursus americanus -72.77432 43.94883 2015-02-13 16:16:41 1065611122 #> 5 Ursus americanus -72.34617 43.86464 2015-03-01 20:20:45 1088908315 #> 6 Ursus americanus -108.53674 32.65219 2015-03-29 17:06:54 1088932238
df[1, "mylat"] <- 170 dframe(df) %>% coord_impossible(lat = "mylat", lon = "mylon")
#> # A tibble: 1,499 x 5 #> name mylon mylat date key #> * <chr> <dbl> <dbl> <dttm> <int> #> 1 Ursus americanus -82.4 35.7 2015-01-13 00:25:39 1065588899 #> 2 Ursus americanus -99.1 23.7 2015-02-20 23:00:00 1098894889 #> 3 Ursus americanus -72.8 43.9 2015-02-13 16:16:41 1065611122 #> 4 Ursus americanus -72.3 43.9 2015-03-01 20:20:45 1088908315 #> 5 Ursus americanus -109. 32.7 2015-03-29 17:06:54 1088932238 #> 6 Ursus americanus -109. 32.7 2015-03-29 17:12:50 1088932273 #> 7 Ursus americanus -124. 40.1 2015-03-28 23:00:00 1132403409 #> 8 Ursus americanus -78.3 36.9 2015-03-20 21:11:24 1088923534 #> 9 Ursus americanus -76.8 35.5 2015-04-05 23:00:00 1088954559 #> 10 Ursus americanus -103. 29.3 2015-04-29 22:00:00 1088964797 #> # … with 1,489 more rows
df <- sample_data_6 # Remove uncertain occurances NROW(df)
#> [1] 50
df1<-df %>% coord_uncertain() NROW(df1)
#> [1] 38
attr(df, "coord_uncertain")
#> NULL
NROW(df)
#> [1] 50
df2<-df %>% coord_uncertain(coorduncertainityLimit = 20000) NROW(df2)
#> [1] 38
NROW(df)
#> [1] 50
df3<-df %>% coord_uncertain(coorduncertainityLimit = 20000,ignore.na=TRUE) NROW(df3)
#> [1] 22