library("dplyr")
library("sf")
library("readr")
library("ggplot2")
library("plotly")
library("purrr")
library("janitor")
library("lubridate")
options("sf_max.plot" = 1)
1 Prepare Ticks data
The tick data is user generated with the TickApp, see Section 1 .
<- read_csv("data-raw/classified/tick-reports.csv") tick_reports
Rows: 68375 Columns: 17
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): date, uuid, comment
dbl (14): ID, Lat., Lon., x, y, acc., date acc., body part, report type, age...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
<- "data-raw/public/swissTLM/swissBOUNDARIES3D_1_4_LV95_LN02.gdb"
swissboundaries_path # st_layers(swissboundaries_path)
<- read_sf(swissboundaries_path, "TLM_LANDESGEBIET") |>
switzerland st_zm() |>
filter(NAME != "Liechtenstein") |>
st_union() |>
st_transform(2056)
# clean column names and format columns
<- tick_reports |>
tick_reports ::clean_names() |>
janitormutate(across(c(id, x, y, acc, body_part, report_type, age, gender, host, pickup, deleted), ~ as.integer(.x))) |>
mutate(across(c(lat, lon, date_acc), ~ as.numeric(.x))) |>
mutate(
datetime = as.POSIXct(date, format = "%Y-%m-%d %H:%M:%S"),
date = as.Date(datetime)
)
# remove rows without a data or data is older than 2015
<- tick_reports |>
tick_reports filter(!is.na(date)) |>
filter(date > "2015-01-01")
# remove reports outside a oversized bounding box
# This is redundant, as the data will be filtered by swissBOUNDARIES3D later
<- 485000
XMIN_21781 <- 834000
XMAX_21781 <- 296000
YMAX_21781 <- 075000
YMIN_21781
<- tick_reports %>%
tick_reports filter(x < XMAX_21781, x > XMIN_21781) %>%
filter(y < YMAX_21781, y > YMIN_21781)
# remove reports without a uuid or reports that are marked as deleted
<- tick_reports %>%
tick_reports filter(uuid != "") %>%
filter(deleted != 1)
# till now, all steps are relatively straightforward. Now, opinions start to
# matter.
# remove reports with a spatial accuracy of more than 1 km radius
<- tick_reports |>
tick_reports filter(acc < 1000)
# this step is not necessary anymore, I'll keep it here to document
# default acc values. There are more default values it seems, as can be seen
# when visualizing the data as a histogram
<- tick_reports %>%
tick_reports filter(!(acc %in% c(57274L, 64434L, 1014L)))
<- st_as_sf(tick_reports, coords = c("lon", "lat"), crs = 4326)
tick_reports_sf
# I dont know who came up with this, but it doesn't seem to be the case that there
# are more locations near the default locations than outside
<- data.frame(lat = c(47.3647, 46.81), lon = c(8.5534, 8.23)) |>
default_locations st_as_sf(coords = c("lon", "lat"), crs = 4326)
<- st_is_within_distance(tick_reports_sf, default_locations, 1000) |>
is_default map_lgl(\(x)length(x) > 0)
sum(is_default)
[1] 23
# date accuracy only has two values: 43'200 and 432'000 (0.5 and 5 days?)
# There are only 8k reports for the higher value, we can discard these
table(tick_reports$date_acc)
43200 432000
39959 8253
<- tick_reports |>
tick_reports filter(date_acc < 50000)
$date_acc <- NULL
tick_reports$body_part <- NULL
tick_reports$report_type <- NULL
tick_reports$age <- NULL
tick_reports$gender <- NULL
tick_reports$host <- NULL
tick_reports$pickup <- NULL
tick_reports$uuid <- NULL
tick_reports$comment <- NULL
tick_reports$deleted <- NULL tick_reports
# some reports are obivously duplicates: same x and y and date (time may vary slighly)
|>
tick_reports group_by(x, y, date) |>
filter(n() > 1) |>
select(id, datetime)
Adding missing grouping variables: `x`, `y`, `date`
# A tibble: 397 × 5
# Groups: x, y, date [176]
x y date id datetime
<int> <int> <date> <int> <dttm>
1 609729 261433 2015-04-13 118 2015-04-13 08:34:29
2 609729 261433 2015-04-13 119 2015-04-13 08:34:29
3 731962 253501 2015-08-26 2028 2015-08-26 20:46:02
4 731962 253501 2015-08-26 2029 2015-08-26 20:46:02
5 621424 264799 2016-07-18 5737 2016-07-18 13:52:11
6 621424 264799 2016-07-18 5738 2016-07-18 13:53:07
7 634791 249092 2016-07-25 5967 2016-07-25 08:18:49
8 634791 249092 2016-07-25 5968 2016-07-25 08:18:49
9 634791 249092 2016-07-25 5969 2016-07-25 08:18:49
10 667235 212674 2016-08-04 6233 2016-08-04 09:34:30
# ℹ 387 more rows
# keep only distinct reports
<- tick_reports |>
tick_reports distinct(x, y, date, .keep_all = TRUE)
$x <- NULL
tick_reports$y <- NULL
tick_reports
<- st_as_sf(tick_reports, coords = c("lon", "lat"), crs = 4326) |>
tick_reports_sf st_transform(2056)
<- cbind(tick_reports_sf, st_coordinates(tick_reports_sf))
tick_reports_sf
<- tick_reports_sf[switzerland, , ]
tick_reports_sf
$year <- year(tick_reports_sf$date)
tick_reports_sf
ggplot(tick_reports_sf) +
geom_sf() +
facet_wrap(~year)
$year <- NULL
tick_reports_sf
<- "data-processed/Ticks"
ticks_path
if(!dir.exists(ticks_path)) {dir.create(ticks_path)}
st_write(tick_reports_sf, file.path(ticks_path, "tick_reports.gpkg"), "reports_0.1", overwrite = TRUE)
Layer reports_0.1 in dataset data-processed/Ticks/tick_reports.gpkg already exists:
use either append=TRUE to append to layer or append=FALSE to overwrite layer
Error in eval(expr, envir, enclos): Dataset already exists.