Load Libraries

# Load required packages for data manipulation and visualization
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ readr     2.1.6
## ✔ ggplot2   4.0.1     ✔ stringr   1.6.0
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ✔ purrr     1.2.0     ✔ tidyr     1.3.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Load Data from Github and Append Data

github_url <- "https://github.com/FundamentalsSid601/FinalHW-Ex1/raw/main/data/"

# List of all Excel file names to read
file_names <- c("fsi-2006.xlsx", "fsi-2007.xlsx", "fsi-2008.xlsx", "fsi-2009.xlsx",
                "fsi-2010.xlsx", "fsi-2011.xlsx", "fsi-2012.xlsx", "fsi-2013.xlsx",
                "fsi-2014.xlsx", "fsi-2015.xlsx", "fsi-2016.xlsx", "fsi-2017.xlsx",
                "fsi-2018.xlsx", "fsi-2019.xlsx", "fsi-2020.xlsx", "fsi-2021.xlsx",
                "fsi-2022-download.xlsx", "FSI-2023-DOWNLOAD.xlsx")

# Loop through each file, download from GitHub, read it, and combine all into one dataframe
all_data <- map_df(file_names, function(file) {
  url <- paste0(github_url, file)              # Create full URL
  temp_file <- tempfile(fileext = ".xlsx")     # Create temporary file
  download.file(url, temp_file, mode = "wb", quiet = TRUE)  # Download file
  df <- read_excel(temp_file)                  # Read Excel file into dataframe
  df <- df %>% mutate(Year = as.numeric(Year)) # Convert Year column to numeric
  return(df)
})

Inspect Data

# View first 6 rows of data
head(all_data)
## # A tibble: 6 × 17
##   Country         Year Rank  Total C1: Security Apparat…¹ C2: Factionalized El…²
##   <chr>          <dbl> <chr> <dbl>                  <dbl>                  <dbl>
## 1 Sudan         1.14e9 1st    112.                    9.8                    9.1
## 2 Congo Democr… 1.14e9 2nd    110.                    9.8                    9.6
## 3 Cote d'Ivoire 1.14e9 3rd    109.                    9.8                    9.8
## 4 Iraq          1.14e9 4th    109                     9.8                    9.7
## 5 Zimbabwe      1.14e9 5th    109.                    9.4                    8.5
## 6 Chad          1.14e9 6th    106.                    9.4                    9.5
## # ℹ abbreviated names: ¹​`C1: Security Apparatus`, ²​`C2: Factionalized Elites`
## # ℹ 11 more variables: `C3: Group Grievance` <dbl>, `E1: Economy` <dbl>,
## #   `E2: Economic Inequality` <dbl>, `E3: Human Flight and Brain Drain` <dbl>,
## #   `P1: State Legitimacy` <dbl>, `P2: Public Services` <dbl>,
## #   `P3: Human Rights` <dbl>, `S1: Demographic Pressures` <dbl>,
## #   `S2: Refugees and IDPs` <dbl>, `X1: External Intervention` <dbl>,
## #   `Change from Previous Year` <dbl>
# View all column names
colnames(all_data)
##  [1] "Country"                          "Year"                            
##  [3] "Rank"                             "Total"                           
##  [5] "C1: Security Apparatus"           "C2: Factionalized Elites"        
##  [7] "C3: Group Grievance"              "E1: Economy"                     
##  [9] "E2: Economic Inequality"          "E3: Human Flight and Brain Drain"
## [11] "P1: State Legitimacy"             "P2: Public Services"             
## [13] "P3: Human Rights"                 "S1: Demographic Pressures"       
## [15] "S2: Refugees and IDPs"            "X1: External Intervention"       
## [17] "Change from Previous Year"

Keep Only Required Data

# Select only the 16 required columns and rename them to use underscores instead of colons/spaces
all_data <- all_data %>% select(Country, Year, Rank, Total,
                                 "C1: Security Apparatus", "C2: Factionalized Elites",
                                 "C3: Group Grievance", "E1: Economy",
                                 "E2: Economic Inequality", "E3: Human Flight and Brain Drain",
                                 "P1: State Legitimacy", "P2: Public Services",
                                 "P3: Human Rights", "S1: Demographic Pressures",
                                 "S2: Refugees and IDPs", "X1: External Intervention") %>%
  rename(C1_Security_Apparatus = "C1: Security Apparatus",
         C2_Factionalized_Elites = "C2: Factionalized Elites",
         C3_Group_Grievance = "C3: Group Grievance",
         E1_Economy = "E1: Economy",
         E2_Economic_Inequality = "E2: Economic Inequality",
         E3_Human_Flight_and_Brain_Drain = "E3: Human Flight and Brain Drain",
         P1_State_Legitimacy = "P1: State Legitimacy",
         P2_Public_Services = "P2: Public Services",
         P3_Human_Rights = "P3: Human Rights",
         S1_Demographic_Pressures = "S1: Demographic Pressures",
         S2_Refugees_and_IDPs = "S2: Refugees and IDPs",
         X1_External_Intervention = "X1: External Intervention")

Clean and Format Data

# Remove non-numeric characters from Rank (e.g., "1st" becomes 1) and convert to number
all_data <- all_data %>% mutate(Rank = as.numeric(gsub("[^0-9]", "", Rank)))
# Convert Year to integer
all_data <- all_data %>% mutate(Year = as.integer(Year))
# Convert Year from Unix timestamp to actual year (some files store Year as seconds since 1970)
all_data <- all_data %>% mutate(Year = lubridate::year(as.POSIXct(Year, origin = "1970-01-01", tz = "UTC")))
# Check unique years to verify conversion
unique(all_data$Year)
##  [1] 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020
## [16] 1970 2022
# Fix any 1970 values (these are from 2023 data where Year was stored as 0)
all_data <- all_data %>% mutate(Year = if_else(Year == 1970, 2023, Year))
# Ensure Year is stored as integer
all_data <- all_data %>% mutate(Year = as.integer(Year))
# Verify final data structure and values
head(all_data)
## # A tibble: 6 × 16
##   Country          Year  Rank Total C1_Security_Apparatus C2_Factionalized_Eli…¹
##   <chr>           <int> <dbl> <dbl>                 <dbl>                  <dbl>
## 1 Sudan            2006     1  112.                   9.8                    9.1
## 2 Congo Democrat…  2006     2  110.                   9.8                    9.6
## 3 Cote d'Ivoire    2006     3  109.                   9.8                    9.8
## 4 Iraq             2006     4  109                    9.8                    9.7
## 5 Zimbabwe         2006     5  109.                   9.4                    8.5
## 6 Chad             2006     6  106.                   9.4                    9.5
## # ℹ abbreviated name: ¹​C2_Factionalized_Elites
## # ℹ 10 more variables: C3_Group_Grievance <dbl>, E1_Economy <dbl>,
## #   E2_Economic_Inequality <dbl>, E3_Human_Flight_and_Brain_Drain <dbl>,
## #   P1_State_Legitimacy <dbl>, P2_Public_Services <dbl>, P3_Human_Rights <dbl>,
## #   S1_Demographic_Pressures <dbl>, S2_Refugees_and_IDPs <dbl>,
## #   X1_External_Intervention <dbl>
str(all_data)
## tibble [3,170 × 16] (S3: tbl_df/tbl/data.frame)
##  $ Country                        : chr [1:3170] "Sudan" "Congo Democratic Republic" "Cote d'Ivoire" "Iraq" ...
##  $ Year                           : int [1:3170] 2006 2006 2006 2006 2006 2006 2006 2006 2006 2006 ...
##  $ Rank                           : num [1:3170] 1 2 3 4 5 6 6 8 9 10 ...
##  $ Total                          : num [1:3170] 112 110 109 109 109 ...
##  $ C1_Security_Apparatus          : num [1:3170] 9.8 9.8 9.8 9.8 9.4 9.4 10 9.4 9.1 8.2 ...
##  $ C2_Factionalized_Elites        : num [1:3170] 9.1 9.6 9.8 9.7 8.5 9.5 9.8 9.6 9.1 8 ...
##  $ C3_Group_Grievance             : num [1:3170] 9.7 9.1 9.8 9.8 8.5 8.5 8 8.8 8.6 9.1 ...
##  $ E1_Economy                     : num [1:3170] 7.5 8.1 9 8.2 9.8 7.9 8.5 8.4 7 7.5 ...
##  $ E2_Economic_Inequality         : num [1:3170] 9.2 9 8 8.7 9.2 9 7.5 8.3 8.9 8 ...
##  $ E3_Human_Flight_and_Brain_Drain: num [1:3170] 9.1 8 8.5 9.1 9 8 7 8 8.1 7 ...
##  $ P1_State_Legitimacy            : num [1:3170] 9.5 9 10 8.5 8.9 9.5 10 9.4 8.5 8.3 ...
##  $ P2_Public_Services             : num [1:3170] 9.5 9 8.5 8.3 9.5 9 10 9.3 7.5 8 ...
##  $ P3_Human_Rights                : num [1:3170] 9.8 9.5 9.4 9.7 9.5 9.1 9.5 9.6 8.5 8.2 ...
##  $ S1_Demographic_Pressures       : num [1:3170] 9.6 9.5 8.8 8.9 9.7 9 9 8.8 9.3 7.9 ...
##  $ S2_Refugees_and_IDPs           : num [1:3170] 9.7 9.5 7.6 8.3 8.9 9 8.1 5 9.3 9.6 ...
##  $ X1_External_Intervention       : num [1:3170] 9.8 10 10 10 8 8 8.5 10 9.2 10 ...
unique(all_data$Year)
##  [1] 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020
## [16] 2023 2022

Boxplot: Total Score by Year

# Create boxplot showing distribution of Total FSI score for each year
ggplot(all_data, aes(x = factor(Year), y = Total)) +
  geom_boxplot(fill = "steelblue") +
  labs(title = "Distribution of Fragile States Index Total Score by Year",
       subtitle = "Higher scores indicate greater state fragility",
       x = "Year",
       y = "Total Score",
       caption = "Source: Fragile States Index Data (2006-2023)") +
  theme_minimal()

Histograms: Cohesion Indicators (2013 vs 2023)

# Filter for 2013 and 2023, reshape data to long format, and create faceted histograms
all_data %>% filter(Year %in% c(2013, 2023)) %>%
  pivot_longer(cols = c(C1_Security_Apparatus, C2_Factionalized_Elites, C3_Group_Grievance),
               names_to = "Variable", values_to = "Value") %>%
  ggplot(aes(x = Value, fill = factor(Year))) +
  geom_histogram(bins = 15, alpha = 0.7, position = "identity") +
  facet_wrap(~Variable, scales = "free", ncol = 1) +
  labs(title = "Comparison of Cohesion Indicators: 2013 vs 2023",
       subtitle = "Higher scores indicate greater fragility",
       x = "Score",
       y = "Number of Countries",
       fill = "Year",
       caption = "Source: Fragile States Index Data") +
  theme_minimal()