source("../appendix/packages.R")
# Manually extracted features from potentially relevant papers
extracted_features <- read_excel("data/extracted_features.xlsx",
                                 sheet = "Include", na = "NA")
# Lower-case the DOI so that matching on DOI does not depend on letter case
extracted_features <- extracted_features |> mutate(DOI = tolower(DOI))
# Citation data from Zotero
citations <- read.csv("data/citations.csv")
# Lower-case the DOI so that matching on DOI does not depend on letter case
citations <- citations |> mutate(DOI = tolower(DOI))
# Citation number data from OpenAlex
open_alex <- read.csv("data/OpenAlex.csv")
# Keep only the join key and the citation count
open_alex <- open_alex[, c("DOI", "globalCitationsCount")]
# Lower-case the DOI so that matching on DOI does not depend on letter case
open_alex <- open_alex |> mutate(DOI = tolower(DOI))
# For study label/study ID: merging first author and year
# Each Author string is split on ", " and only the first piece is kept;
# vapply() guarantees exactly one character value per citation.
citations$AuthorYear <- paste(
  vapply(strsplit(citations$Author, ", "), `[`, character(1), 1),
  "et al.",
  citations$Publication.Year
)
# Join the publication details with the extracted features
# Only citations whose DOI appears in the extracted features are kept
citations_needed <- subset(citations, DOI %in% extracted_features$DOI,
                           select = c(DOI, AuthorYear,
                                      Publication.Title,
                                      Publication.Year))
dat <- inner_join(citations_needed, extracted_features, by = "DOI")

# Add global citation number from OpenAlex
# (left join: rows without an OpenAlex match are kept, with NA counts)
dat <- left_join(dat, open_alex, by = "DOI")
# Study features: factors
cols_to_factor <- c("SDG_theme", "classification_type", "model_group",
                    "ancillary", "indices", "RS_device_type", "RS_devices",
                    "RS_device_group", "RS_spectral_bands_no",
                    "RS_spatital_resolution_m", "Confusion_matrix")

# Select and clean the final dataset for analysis
my_data <- subset(dat, !is.na(total), # Omit studies without totals
                  select = c("DOI", "AuthorYear", "Publication.Year",
                             "globalCitationsCount", "location",
                             cols_to_factor,
                             "OA_reported", "number_classes",
                             "fraction_majority_class", "total"))
# Regroup the extracted features: at least 5 for each group
# Everything that is neither "Neural Networks" nor "Tree-Based Models"
# is collapsed into "Other"
my_data$model_group <- ifelse((my_data$model_group == "Neural Networks" |
                                 my_data$model_group == "Tree-Based Models"),
                              my_data$model_group, "Other")

# Fix the level order of the regrouped variable
my_data$model_group <- factor(my_data$model_group,
                              levels = c("Neural Networks",
                                         "Tree-Based Models", "Other"))
## Group the number of bands (low, mid, not reported)
## Values outside the listed band counts end up as NA
my_data$no_band_group <- with(
  my_data,
  ifelse(RS_spectral_bands_no == "Not Reported",
         "Not Reported",
         ifelse(RS_spectral_bands_no %in% c(1, 4, 5),
                "Low",
                ifelse(RS_spectral_bands_no %in%
                         c(7, 8, 9, 10, 11, 13, 14),
                       "Mid", NA)))
)
# Group remote sensing spatial resolution
# The resolution column also holds text such as "Not Reported", so it is a
# character column. Comparing a character vector with a number coerces the
# number to a string and compares strings (e.g. "2" >= "10" is TRUE), so the
# thresholds are applied to a numeric copy instead. Values that cannot be
# parsed as numbers (and values outside both ranges) keep their original text.
my_data$RS_spatital_res_grouped <- local({
  res_raw <- as.character(my_data$RS_spatital_resolution_m)
  # Non-numeric entries become NA here; the coercion warning is expected
  res_num <- suppressWarnings(as.numeric(res_raw))
  ifelse(is.na(res_num),
         res_raw,
         ifelse(res_num < 1,
                "<1 metre",
                ifelse(res_num >= 10 & res_num <= 30,
                       "10-30 metres",
                       res_raw)))
})
# ## maybe this is better:
# my_data$RS_spatital_res_grouped <- ifelse(my_data$RS_spatital_resolution_m !=
# "Not Reported", "Reported",
# my_data$RS_spatital_resolution_m)
# Reorder RS_device_group
# (as with any factor() call, values not listed in `levels` become NA)
my_data$RS_device_group <- factor(my_data$RS_device_group,
                                  levels = c("Sentinel", "Landsat",
                                             "Other", "Not Reported"))

# SDG
my_data$SDG_theme <- factor(my_data$SDG_theme,
                            levels = c("SDG2: Zero Hunger",
                                       "SDG11: Sustainable Cities",
                                       "SDG15: Life on Land"))

# Label for ancillary
my_data$ancillary <- factor(my_data$ancillary,
                            levels = c(0, 1),
                            labels = c("Remote Sensing Only",
                                       "Ancillary Data Included"))

# Label for indices
my_data$indices <- factor(my_data$indices,
                          levels = c(0, 1),
                          labels = c("Not Used", "Used"))

# Label for confusion matrix
my_data$Confusion_matrix <- factor(my_data$Confusion_matrix,
                                   levels = c(0, 1),
                                   labels = c("Not Reported", "Reported"))
# Estimate ID (esid) based on each study (AuthorYear)
# Rows are numbered within each study; ungroup() afterwards so that my_data
# does not silently stay grouped by AuthorYear in later steps.
my_data <- my_data |>
  group_by(AuthorYear) |>
  mutate(esid = row_number()) |>
  ungroup()

# Event (s_ij) variable for analysis of proportions
my_data$event <- my_data$total * my_data$OA_reported
# Write the cleaned analysis dataset to disk as CSV
# NOTE(review): the inputs are read from "data/" but this writes to
# "../data/" - confirm that the different directory is intended.
write.csv(x = my_data, file = "../data/analysis_df.csv")
Appendix B — Data wrangling
The following code shows how I combined the data from different sources and grouped the variables. For the final data, see the GitHub repository.
The following code assesses whether any categorical variable in the dataset has values that are unique to a single study. First, the relevant categorical variables, including features such as remote sensing device type and spatial resolution, are selected. A function is then defined to group each variable by its values and count the number of distinct papers associated with each value. The tables show the number of papers and effect sizes for each value, and give the study name when a category is represented by a single study only. The number of effect sizes per category is always greater than 5; however, there are a few instances in which only one study contributed to a category.
# Categorical variables to check for values contributed by a single study
# (each variable is listed once)
categorical_cols <- c("SDG_theme", "classification_type", "model_group",
                      "ancillary", "indices", "RS_device_group",
                      "RS_devices", "RS_device_type", "no_band_group",
                      "RS_spatital_res_grouped", "Confusion_matrix")
#' Does any value of a variable come from a single study only?
#'
#' @param df Data frame with an `AuthorYear` column identifying the study.
#' @param var_name Name (string) of the column in `df` to check.
#' @return `TRUE` if at least one value of `var_name` occurs in exactly one
#'   distinct `AuthorYear`, otherwise `FALSE`.
check_single_study <- function(df, var_name) {
  studies_per_value <- df %>%
    group_by_at(var_name) %>%
    summarise(unique_studies = n_distinct(AuthorYear))

  any(studies_per_value$unique_studies == 1)
}
# Keep the variables that have at least one value seen in a single study;
# vapply() guarantees one logical per variable.
single_study_vars <- categorical_cols[vapply(
  categorical_cols,
  function(v) check_single_study(my_data, v),
  logical(1)
)]
#' Count papers and effect sizes for each value of a variable
#'
#' @param df Data frame with an `AuthorYear` column identifying the study.
#' @param var_name Name (string) of the column in `df` to summarise.
#' @return One row per value of `var_name` with `count_papers` (distinct
#'   `AuthorYear`), `count_effect_sizes` (number of rows) and `study` (the
#'   study name when only one paper contributes, otherwise `NA`), sorted by
#'   `count_papers` in descending order.
count_studies_effect_sizes_and_study <- function(df, var_name) {
  df %>%
    group_by_at(var_name) %>%
    summarise(
      count_papers = n_distinct(AuthorYear), # Count distinct papers
      count_effect_sizes = n(), # Count total number of effect sizes
      # Name the study only when it is the sole contributor; the condition is
      # a single value per group, so plain if/else is sufficient
      study = if (count_papers == 1) first(AuthorYear) else NA_character_
    ) %>%
    arrange(desc(count_papers))
}
# Loop through the categorical variables and count the number of papers,
## effect sizes, study name given if unique
count_values_list <- list()
for (var in single_study_vars) {
  count_values <- count_studies_effect_sizes_and_study(my_data, var)
  count_values_list[[var]] <- count_values
}

# Print one summary table per variable
for (var in names(count_values_list)) {
  print(count_values_list[[var]])
}
# A tibble: 4 × 4
RS_device_group count_papers count_effect_sizes study
<fct> <int> <int> <chr>
1 Sentinel 9 20 <NA>
2 Landsat 8 15 <NA>
3 Other 4 44 <NA>
4 Not Reported 1 7 Jochem et al. 2018
# A tibble: 2 × 4
RS_devices count_papers count_effect_sizes study
<chr> <int> <int> <chr>
1 satellite 19 79 <NA>
2 aerial photographic images 1 7 Shen et al. 2023
# A tibble: 4 × 4
RS_device_type count_papers count_effect_sizes study
<chr> <int> <int> <chr>
1 Passive 15 61 <NA>
2 Combined 4 7 <NA>
3 Active 3 11 <NA>
4 Not Reported 1 7 Jochem et al. 2018
# A tibble: 3 × 4
RS_spatital_res_grouped count_papers count_effect_sizes study
<chr> <int> <int> <chr>
1 10-30 metres 16 39 <NA>
2 Not Reported 4 40 <NA>
3 <1 metre 1 7 Shen et al. 2023