Appendix A — Paper selection

A systematic literature search was performed in the databases listed in "database_notes.csv" on January 15 and 16, 2024, using search terms to identify eligible articles, limited to publication dates from 01/01/2018 to 31/12/2023. The keywords used for the search were "Remote sensing" AND "machine learning" AND "sustainable development goals" (see the table "database_notes.csv").

Missing DOIs were identified using the Zotero "no DOI" tag. The DOI was then added either via a Zotero add-on ("manage DOI") or by opening the paper and re-adding it to Zotero via the Zotero Connector for Chrome.

The search results were downloaded as RIS files and imported into Zotero. Zotero's merge-duplicates function was applied, after which a CSV file was exported.
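As a sanity check on the merge, the DOIs that still appear more than once in the exported CSV can be counted (a sketch using the all_citations object read in below; this check was not part of the original workflow):

Code
# DOIs occurring more than once after Zotero's duplicate merge
doi <- all_citations$DOI[!is.na(all_citations$DOI)]
sum(duplicated(doi))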

Code
library("readxl")
notes<- read_excel("database_notes.xlsx")
sum(notes$`Applicable Results`)
Code
all_citations <- read.csv("all_citations_j19.csv",
                          na.strings = c("", "NA")) # make blank cells NA
# records removed by Zotero's duplicate merge
sum(notes$`Applicable Results`) - nrow(all_citations)

all_citations <- subset(all_citations, 
                        Item.Type == "journalArticle" & 
                          Title != "CALL FOR PAPERS")
# some reports and a call for papers made it in 
sum(notes$`Applicable Results`) - nrow(all_citations)

Retrieving missing abstracts

ref: https://rpubs.com/cschwarz/web_scrape

Code
# missing_index <- which(is.na(all_citations$Abstract.Note))
# not_missing <- all_citations[!is.na(all_citations$Abstract.Note), ]

# retrieve abstract function: follow the DOI and scrape the abstract text
grab_info <- function(article){
  if (is.na(article["Abstract.Note"])) {
    article_link <- paste0("https://doi.org/", article["DOI"])
    a_html <- try(read_html(article_link), silent = TRUE)
    if (inherits(a_html, "try-error")) {
      # DOI did not resolve or the page could not be fetched
      article["Abstract.Note"] <- NA
    } else {
      # the CSS selector is publisher-specific; html_text() returns NA
      # where the node is not found
      abstract <- a_html %>% html_node(".hlFld-Abstract .last") %>% html_text()
      article["Abstract.Note"] <- abstract
    }
  }
  return(article)
}

library(parallel)

cl <- makeCluster(detectCores())
clusterExport(cl, varlist = c("grab_info"))
clusterEvalQ(cl, library(rvest))
clusterEvalQ(cl, library(stringr))

library(pbapply)
# scrape in parallel, row by row, with a progress bar
citations <- as.data.frame(t(pbapply(all_citations, 1, grab_info, cl = cl)))
stopCluster(cl) # release the workers
# abstracts still missing after scraping
sum(is.na(citations$Abstract.Note))


# to make searching for words easier and because metagear is case sensitive 
citations$Title <- tolower(citations$Title) 
citations$Abstract.Note <- tolower(citations$Abstract.Note) 

#write.csv(citations, file = "citations_with_reivew.csv")

TO DO: manually add the 6 missing abstracts using Zotero.
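To make the manual step easier, the rows still missing an abstract can be listed (a sketch, assuming the citations data frame from the chunk above):

Code
# papers whose abstracts could not be scraped, for manual lookup in Zotero
citations[is.na(citations$Abstract.Note), c("Title", "DOI")]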

Keywords that indicate a paper should be rejected

Checking titles first.

Code
cit<- read.csv("citations_with_reivew.csv")

to_omit <- cit[which(stringr::str_detect(cit$Title, "review")), ]
to_omit$Title

# After reviewing these titles, they were safe to omit before metagear screening
citations <- cit[which(!stringr::str_detect(cit$Title, "review")), ]
write.csv(citations, file = "citations.csv")

citations <- read.csv("citations.csv")

# number of papers omitted as reviews
nrow(cit) - nrow(citations)

Abstract screening

Metagear package

Code
library(metagear)

Keywords

Code
keywords <- c(
    
  # general
  "empirical", "result", "predictive",
  "analysis", "sustainable development goal", 
  "sustainable development",
              
  # data related 
  "remotely sensed", "remote sensing", "satellite", "earth observation", 
  
  # models
  "deep learning", "machine learning", "classification", "classifier",
  "regression", "supervised", "supervized", "test set", "training set",
  " cart ", "svm", " rf ", " ann ", "random forest", "support vector machine", 
  "regression tree", "decision tree", "neural network", "boosting", "bagging", 
  "gradient", "bayes",
  
  # quality metrics
  "overall accuracy", "accuracy", "coefficient of determination", "rmse", "mse",
  "f1", "precision", "auc", " roc ", "recall","sensitivity", "specificity",
  "mean absolute error", "error", "mae",
  
  #to omit 
  "systematic review", "meta-analysis" , "review"
)
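As a rough check of keyword coverage, one can count how many abstracts contain at least one keyword (a sketch, not part of the screening itself; both the abstracts and the keyword list are already lower case):

Code
# share of abstracts matching at least one screening keyword
pattern <- paste(keywords, collapse = "|")
mean(stringr::str_detect(citations$Abstract.Note, pattern), na.rm = TRUE)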

Testing the abstract screening process

Code
citations <- read.csv("citations.csv")

Phase one

Code
cite_testset <- citations[sample(nrow(citations), size=100), ] # set.seed next time!

A random sample of 100 papers was screened independently by three reviewers using metagear and the keywords defined above.

Code
metagear_abstract_screening <- function(data, reviewer){
  library(metagear)
  # prime the study-reference dataset
  theRefs <- effort_initialize(data)
  # here one would distribute screening effort across a team; in this case each
  # split goes to a single reviewer and is saved to a separate file
  theRefs_unscreened <- effort_distribute(theRefs, reviewers = reviewer,
                                          effort = 100, save_split = TRUE)
}


# the following was run for each reviewer
# example: nina
metagear_abstract_screening(citations, "nina")

# initialize screener GUI
abstract_screener("effort_nina.csv", aReviewer = "nina",
                  abstractColumnName = "Abstract.Note",
                  titleColumnName = "Title",
                  highlightKeywords = keywords)

Result

Code
nina_1<- read.csv("reviewer_screaning/effort_nina.csv")
jonas_1<- read.csv("reviewer_screaning/effort_jonas.csv")
joep_1<- read.csv("reviewer_screaning/effort_joep.csv")
ftable(nina_1$INCLUDE, jonas_1$INCLUDE, joep_1$INCLUDE,  dnn = c("nina", "jonas", "joep"))
            joep maybe NO YES
nina  jonas                  
maybe maybe          0  1   0
      NO             1  1   0
      YES            2  0   1
NO    maybe          0  3   1
      NO             0 29   0
      YES            1  7   2
YES   maybe          0  0   1
      NO             1  3   1
      YES            5  4  36

All three reviewers agreed on YES for 36% of the papers and on NO for 29%.
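These figures come from the two unanimous cells of the table above and can be checked element-wise (assuming, as the ftable call does, that the three effort files list the same 100 papers in the same order):

Code
# share of papers where all three reviewers said YES, and where all said NO
mean(nina_1$INCLUDE == "YES" & jonas_1$INCLUDE == "YES" & joep_1$INCLUDE == "YES")
mean(nina_1$INCLUDE == "NO" & jonas_1$INCLUDE == "NO" & joep_1$INCLUDE == "NO")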

After reviewing the cases where reviewers disagreed, we learnt that not all of the included papers use remote sensing data; these were difficult to categorise and might require opening the full paper. Therefore, the next trial will be:

  • phase 2a: screening for empirical research, rather than papers reviewing or discussing methods.

  • phase 2b: the papers passing phase 2a will then be assessed more carefully for the use of remote sensing data and machine learning.

Phase 2a

Code
# new random sample: excluding all papers reviewed in phase 1
not_in_sample1 <- subset(citations, !(DOI %in% nina_1$DOI))
set.seed(123)
sample2 <- not_in_sample1[sample(nrow(not_in_sample1), size = 100), ]
Code
# the following was run for each reviewer
# example: nina
metagear_abstract_screening(sample2, "nina_2a")

# initialize screener GUI
abstract_screener("effort_nina_2a.csv", aReviewer = "nina_2a",
                  abstractColumnName = "Abstract.Note",
                  titleColumnName = "Title",
                  highlightKeywords = keywords)

Screening abstracts of references

Instructions
  1. run the keywords chunk above and the helper function below

Code
metagear_abstract_screening <- function(data, reviewer){
  library(metagear)
  # prime the study-reference dataset
  theRefs <- effort_initialize(data)
  # here one would distribute screening effort across a team; in this case each
  # split goes to a single reviewer and is saved to a separate file
  theRefs_unscreened <- effort_distribute(theRefs, reviewers = reviewer,
                                          effort = 100, save_split = TRUE)
}
  2. run the applicable chunk below
  • choose between the relevance categories: YES, maybe, NO
  • once you are done reviewing, save (bottom right; if you don't save, none of the changes are recorded)

For Jonas

Code
# Stage 1

# initialize screener GUI
abstract_screener("effort_jonas_2a.csv", aReviewer = "jonas_2a",
                  abstractColumnName = "Abstract.Note",
                  titleColumnName = "Title",
                  highlightKeywords = keywords)
Code
keywords_new <- unique(c(keywords,
                  # data related (repeats of the original list are dropped by unique())
                  "remotely sensed", "remote sensing", "satellite", "earth observation",
                  " rs ", "images", "imagery", "sentinel", "landsat", "openstreetmap",
                  "google earth engine", "true color", "true colour", "false color",
                  "false colour", "rgb", "resolution"
                  ))
Code
# Stage 2
# after reviewing, run:
s1_review <- read.csv("effort_jonas_2a.csv")
# keep everything not explicitly excluded (YES and maybe)
rev <- subset(s1_review, INCLUDE != "NO")

metagear_abstract_screening(rev, "jonas_2b")

# initialize screener GUI
abstract_screener("effort_jonas_2b.csv", aReviewer = "jonas_2b",
                  abstractColumnName = "Abstract.Note",
                  titleColumnName = "Title",
                  highlightKeywords = keywords_new)

Compare

Code
nina_2a<- read.csv("reviewer_screaning/effort_nina_2a.csv")
jonas_2a<- read.csv("reviewer_screaning/effort_jonas_2a.csv")
joep_2a<- read.csv("reviewer_screaning/effort_joep_2a.csv")

ftable(nina_2a$INCLUDE, jonas_2a$INCLUDE, joep_2a$INCLUDE,  dnn = c("nina", "jonas", "joep"))
           joep NO YES
nina jonas            
NO   NO         44   1
     YES         1   1
YES  NO          2   1
     YES         5  45

Stage 1: the three reviewers agreed on 89% of the papers (44 unanimous NO + 45 unanimous YES).
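An element-wise check of that figure, under the same row-alignment assumption as the ftable:

Code
# share of papers where all three stage 1 (2a) decisions agree
mean(nina_2a$INCLUDE == jonas_2a$INCLUDE & jonas_2a$INCLUDE == joep_2a$INCLUDE)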

Stage 2

Code
jonas_2b <- read.csv("reviewer_screaning/effort_jonas_2b.csv")
nina_2b <- read.csv("reviewer_screaning/effort_nina_2b.csv")
joep_2b <- read.csv("reviewer_screaning/effort_joep_2b.csv")
Code
library(dplyr) # left_join() is used below

# start from joep's stage 1 (2a) decisions
all <- joep_2a |>
  dplyr::select(INCLUDE, Title, Abstract.Note, Publication.Year, DOI)

# add each reviewer's stage 2 (2b) decision, matched by DOI
all2 <- left_join(all, subset(joep_2b, select = c(INCLUDE, DOI)),
                  by = "DOI", suffix = c("_joep_a", "_joep"))

all2 <- left_join(all2, subset(jonas_2b, select = c(INCLUDE, DOI)),
                  by = "DOI")
colnames(all2)[colnames(all2) == "INCLUDE"] <- "INCLUDE_jonas"

all2 <- left_join(all2, subset(nina_2b, select = c(INCLUDE, DOI)),
                  by = "DOI")
colnames(all2)[colnames(all2) == "INCLUDE"] <- "INCLUDE_nina"

# papers excluded at stage 1 were never re-screened in 2b; mark them NO
all2[which(is.na(all2$INCLUDE_joep) & is.na(all2$INCLUDE_jonas) & is.na(all2$INCLUDE_nina)), 6:8] <- "NO"
Code
ftable(all2$INCLUDE_nina, all2$INCLUDE_jonas, all2$INCLUDE_joep,  dnn = c("nina", "jonas", "joep"))

Overall, all three reviewers agreed on 67% of the papers, with unanimous YES on 21%. Papers with at least one YES:

Code
# papers with at least one YES across the three reviewers
sum(all2$INCLUDE_joep == "YES" | all2$INCLUDE_nina == "YES" | all2$INCLUDE_jonas == "YES", na.rm = TRUE)

# YES counts per reviewer
sum(all2$INCLUDE_joep == "YES", na.rm = TRUE)
sum(all2$INCLUDE_nina == "YES", na.rm = TRUE)
sum(all2$INCLUDE_jonas == "YES", na.rm = TRUE)
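The agreement figures quoted above can be reproduced from all2 (a sketch; column names as created in the joins above):

Code
# overall three-way agreement, and share of unanimous YES
mean(all2$INCLUDE_nina == all2$INCLUDE_jonas &
       all2$INCLUDE_jonas == all2$INCLUDE_joep, na.rm = TRUE)
mean(all2$INCLUDE_nina == "YES" & all2$INCLUDE_jonas == "YES" &
       all2$INCLUDE_joep == "YES", na.rm = TRUE)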

Adding phase 1 for total agreement

Code
# combine the three phase 1 decision files, matched by DOI
all3 <- nina_1 |>
  dplyr::select(INCLUDE, Title, Abstract.Note, Publication.Year, DOI)

all3 <- left_join(all3, subset(jonas_1, select = c(INCLUDE, DOI)),
                  by = "DOI")
colnames(all3)[colnames(all3) == "INCLUDE.x"] <- "INCLUDE_nina"
colnames(all3)[colnames(all3) == "INCLUDE.y"] <- "INCLUDE_jonas"

all3 <- left_join(all3, subset(joep_1, select = c(INCLUDE, DOI)),
                  by = "DOI")
colnames(all3)[colnames(all3) == "INCLUDE"] <- "INCLUDE_joep"
Code
# stack phase 1 and phase 2 decisions (dropping the stage 2a column of all2)
all_reviewed <- rbind(all3, all2[, -1])

ftable_result <- ftable(all_reviewed$INCLUDE_nina, all_reviewed$INCLUDE_jonas,
                        all_reviewed$INCLUDE_joep, dnn = c("nina", "jonas", "joep"))

# papers all three reviewers agreed to include
all_agreed_yes <- subset(all_reviewed,
                         subset = (INCLUDE_joep == "YES" &
                                     INCLUDE_nina == "YES" &
                                     INCLUDE_jonas == "YES"))
#write.csv(all_agreed_yes, file = "all_agreed_yes.csv")

ftable_df <- as.data.frame(ftable_result)

# total YES per reviewer across both phases
sum(ftable_df$Freq[ftable_df$nina == "YES"])
sum(ftable_df$Freq[ftable_df$jonas == "YES"])
sum(ftable_df$Freq[ftable_df$joep == "YES"])
Code
all_agreed_yes <- read.csv("reviewer_screaning/all_agreed_yes.csv")
#metagear_abstract_screening(all_agreed_yes, "clas_v_reg_nina")

# screen the agreed papers into classification vs regression tasks;
# effort_distribute() saved the split as "effort_clas_v_reg_nina.csv"
abstract_screener("effort_clas_v_reg_nina.csv", aReviewer = "clas_v_reg_nina",
                  abstractColumnName = "Abstract.Note",
                  titleColumnName = "Title",
                  theButtons = c("class", "reg", "unknown"),
                  highlightKeywords = keywords)

current <- read.csv("effort_clas_v_reg_nina.csv")

# number of papers screened as classification
sum(current$INCLUDE == "class")
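A fuller summary that also shows the other buttons and any unscreened rows (a sketch):

Code
# counts for every screening decision, keeping unscreened rows visible
table(current$INCLUDE, useNA = "ifany")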