Project 1: Exploratory Data Analysis

April 6, 2021

What data did I use?

# Data
library(tidyverse)
library(readr)
state_population <- read_csv("https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-population.csv")

key <- read_csv("https://raw.githubusercontent.com/jasonong/List-of-US-States/master/states.csv")

good <- full_join(key, state_population, by = c(Abbreviation = "state/region"))

state_areas <- read_csv("https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-areas.csv")

# Joining Datasets

MERGED <- full_join(good, state_areas, by = c(State = "state"))

# To join my two datasets, I used the dplyr join function
# 'full join'. I chose to use full join because I didn't want
# any rows dropped and for all of the original data to be
# retained.

# Introduction

## I was looking for two datasets that had a common variable
## of states. I was searching the internet and #I found one
## dataset titled state_population, which contains
## state/region, ages, and within that column there is ages
## under 18 and total, years, and lastly population numbers.
## Then, I wanted to find another dataset that also had the
## states variable and I found the dataset titled state_areas.
## This dataset has states in the first column and area in
## square miles in the second column. I thought this would be
## interesting to see if theres any associations between the
## area of a state and the number of people in that state that
## are under 18 or older. I definitely think the higher the
## area of a state, the higher the population in that state
## will be.

# Tidying

tidy <- good %>% pivot_wider(names_from = "ages", values_from = "population")
tidy %>% pivot_longer(c("under18", "total"), names_to = "ages", 
    values_to = "population")

## # A tibble: 2,544 x 5
##    State   Abbreviation  year ages    population
##    <chr>   <chr>        <dbl> <chr>        <dbl>
##  1 Alabama AL            2012 under18    1117489
##  2 Alabama AL            2012 total      4817528
##  3 Alabama AL            2010 under18    1130966
##  4 Alabama AL            2010 total      4785570
##  5 Alabama AL            2011 under18    1125763
##  6 Alabama AL            2011 total      4801627
##  7 Alabama AL            2009 under18    1134192
##  8 Alabama AL            2009 total      4757938
##  9 Alabama AL            2013 under18    1111481
## 10 Alabama AL            2013 total      4833722
## # … with 2,534 more rows

tidy2 <- state_areas %>% pivot_wider(names_from = "state", values_from = "area (sq. mi)")
tidy2 %>% pivot_longer(everything(), names_to = "state")

## # A tibble: 52 x 2
##    state        value
##    <chr>        <dbl>
##  1 Alabama      52423
##  2 Alaska      656425
##  3 Arizona     114006
##  4 Arkansas     53182
##  5 California  163707
##  6 Colorado    104100
##  7 Connecticut   5544
##  8 Delaware      1954
##  9 Florida      65758
## 10 Georgia      59441
## # … with 42 more rows

# On each dataset, I first used pivot wider to make them
# untidy. Then, I used pivot longer to tidy them again, where
# each observation has its own row and every variable has its
# own column.

# Wrangling

# Population variable

mutate(MERGED, new = population/year)

## # A tibble: 2,545 x 7
##    State   Abbreviation ages     year population `area (sq. mi)`   new
##    <chr>   <chr>        <chr>   <dbl>      <dbl>           <dbl> <dbl>
##  1 Alabama AL           under18  2012    1117489           52423  555.
##  2 Alabama AL           total    2012    4817528           52423 2394.
##  3 Alabama AL           under18  2010    1130966           52423  563.
##  4 Alabama AL           total    2010    4785570           52423 2381.
##  5 Alabama AL           under18  2011    1125763           52423  560.
##  6 Alabama AL           total    2011    4801627           52423 2388.
##  7 Alabama AL           total    2009    4757938           52423 2368.
##  8 Alabama AL           under18  2009    1134192           52423  565.
##  9 Alabama AL           under18  2013    1111481           52423  552.
## 10 Alabama AL           total    2013    4833722           52423 2401.
## # … with 2,535 more rows

MERGED %>% mutate(meanp = mean(population, na.rm = T))

## # A tibble: 2,545 x 7
##    State   Abbreviation ages     year population `area (sq. mi)`    meanp
##    <chr>   <chr>        <chr>   <dbl>      <dbl>           <dbl>    <dbl>
##  1 Alabama AL           under18  2012    1117489           52423 6805558.
##  2 Alabama AL           total    2012    4817528           52423 6805558.
##  3 Alabama AL           under18  2010    1130966           52423 6805558.
##  4 Alabama AL           total    2010    4785570           52423 6805558.
##  5 Alabama AL           under18  2011    1125763           52423 6805558.
##  6 Alabama AL           total    2011    4801627           52423 6805558.
##  7 Alabama AL           total    2009    4757938           52423 6805558.
##  8 Alabama AL           under18  2009    1134192           52423 6805558.
##  9 Alabama AL           under18  2013    1111481           52423 6805558.
## 10 Alabama AL           total    2013    4833722           52423 6805558.
## # … with 2,535 more rows

MERGED %>% summarise(meanpopulaton = mean(population, na.rm = T))

## # A tibble: 1 x 1
##   meanpopulaton
##           <dbl>
## 1      6805558.

MERGED %>% filter(population < mean(population, na.rm = T))

## # A tibble: 2,168 x 6
##    State   Abbreviation ages     year population `area (sq. mi)`
##    <chr>   <chr>        <chr>   <dbl>      <dbl>           <dbl>
##  1 Alabama AL           under18  2012    1117489           52423
##  2 Alabama AL           total    2012    4817528           52423
##  3 Alabama AL           under18  2010    1130966           52423
##  4 Alabama AL           total    2010    4785570           52423
##  5 Alabama AL           under18  2011    1125763           52423
##  6 Alabama AL           total    2011    4801627           52423
##  7 Alabama AL           total    2009    4757938           52423
##  8 Alabama AL           under18  2009    1134192           52423
##  9 Alabama AL           under18  2013    1111481           52423
## 10 Alabama AL           total    2013    4833722           52423
## # … with 2,158 more rows

MERGED %>% filter(population > mean(population, na.rm = T))

## # A tibble: 356 x 6
##    State      Abbreviation ages     year population `area (sq. mi)`
##    <chr>      <chr>        <chr>   <dbl>      <dbl>           <dbl>
##  1 California CA           under18  2012    9209007          163707
##  2 California CA           total    2012   37999878          163707
##  3 California CA           under18  2011    9252336          163707
##  4 California CA           total    2011   37668681          163707
##  5 California CA           under18  2010    9284094          163707
##  6 California CA           total    2010   37333601          163707
##  7 California CA           under18  2013    9174877          163707
##  8 California CA           total    2013   38332521          163707
##  9 California CA           total    2009   36961229          163707
## 10 California CA           under18  2009    9294501          163707
## # … with 346 more rows

MERGED %>% filter(population < mean(population, na.rm = T)) %>% 
    select(State)

## # A tibble: 2,168 x 1
##    State  
##    <chr>  
##  1 Alabama
##  2 Alabama
##  3 Alabama
##  4 Alabama
##  5 Alabama
##  6 Alabama
##  7 Alabama
##  8 Alabama
##  9 Alabama
## 10 Alabama
## # … with 2,158 more rows

MERGED %>% filter(population > mean(population, na.rm = T)) %>% 
    select(State)

## # A tibble: 356 x 1
##    State     
##    <chr>     
##  1 California
##  2 California
##  3 California
##  4 California
##  5 California
##  6 California
##  7 California
##  8 California
##  9 California
## 10 California
## # … with 346 more rows

MERGED %>% summarise(sdpop = sd(population, na.rm = T))

## # A tibble: 1 x 1
##       sdpop
##       <dbl>
## 1 28550145.

MERGED %>% filter(population < sd(population, na.rm = T))

## # A tibble: 2,452 x 6
##    State   Abbreviation ages     year population `area (sq. mi)`
##    <chr>   <chr>        <chr>   <dbl>      <dbl>           <dbl>
##  1 Alabama AL           under18  2012    1117489           52423
##  2 Alabama AL           total    2012    4817528           52423
##  3 Alabama AL           under18  2010    1130966           52423
##  4 Alabama AL           total    2010    4785570           52423
##  5 Alabama AL           under18  2011    1125763           52423
##  6 Alabama AL           total    2011    4801627           52423
##  7 Alabama AL           total    2009    4757938           52423
##  8 Alabama AL           under18  2009    1134192           52423
##  9 Alabama AL           under18  2013    1111481           52423
## 10 Alabama AL           total    2013    4833722           52423
## # … with 2,442 more rows

MERGED %>% filter(population > sd(population, na.rm = T))

## # A tibble: 72 x 6
##    State      Abbreviation ages   year population `area (sq. mi)`
##    <chr>      <chr>        <chr> <dbl>      <dbl>           <dbl>
##  1 California CA           total  2012   37999878          163707
##  2 California CA           total  2011   37668681          163707
##  3 California CA           total  2010   37333601          163707
##  4 California CA           total  2013   38332521          163707
##  5 California CA           total  2009   36961229          163707
##  6 California CA           total  2007   36250311          163707
##  7 California CA           total  2008   36604337          163707
##  8 California CA           total  2005   35827943          163707
##  9 California CA           total  2006   36021202          163707
## 10 California CA           total  2003   35253159          163707
## # … with 62 more rows

MERGED %>% filter(population < sd(population, na.rm = T)) %>% 
    select(State)

## # A tibble: 2,452 x 1
##    State  
##    <chr>  
##  1 Alabama
##  2 Alabama
##  3 Alabama
##  4 Alabama
##  5 Alabama
##  6 Alabama
##  7 Alabama
##  8 Alabama
##  9 Alabama
## 10 Alabama
## # … with 2,442 more rows

MERGED %>% filter(population > sd(population, na.rm = T)) %>% 
    select(State)

## # A tibble: 72 x 1
##    State     
##    <chr>     
##  1 California
##  2 California
##  3 California
##  4 California
##  5 California
##  6 California
##  7 California
##  8 California
##  9 California
## 10 California
## # … with 62 more rows

MERGED %>% summarise(minpop = min(population, na.rm = T))

## # A tibble: 1 x 1
##   minpop
##    <dbl>
## 1 101309

MERGED %>% filter(population < min(population, na.rm = T))

## # A tibble: 0 x 6
## # … with 6 variables: State <chr>, Abbreviation <chr>, ages <chr>, year <dbl>,
## #   population <dbl>, `area (sq. mi)` <dbl>

MERGED %>% filter(population > min(population, na.rm = T))

## # A tibble: 2,523 x 6
##    State   Abbreviation ages     year population `area (sq. mi)`
##    <chr>   <chr>        <chr>   <dbl>      <dbl>           <dbl>
##  1 Alabama AL           under18  2012    1117489           52423
##  2 Alabama AL           total    2012    4817528           52423
##  3 Alabama AL           under18  2010    1130966           52423
##  4 Alabama AL           total    2010    4785570           52423
##  5 Alabama AL           under18  2011    1125763           52423
##  6 Alabama AL           total    2011    4801627           52423
##  7 Alabama AL           total    2009    4757938           52423
##  8 Alabama AL           under18  2009    1134192           52423
##  9 Alabama AL           under18  2013    1111481           52423
## 10 Alabama AL           total    2013    4833722           52423
## # … with 2,513 more rows

MERGED %>% filter(population < min(population, na.rm = T)) %>% 
    select(State)

## # A tibble: 0 x 1
## # … with 1 variable: State <chr>

MERGED %>% filter(population > min(population, na.rm = T)) %>% 
    select(State)

## # A tibble: 2,523 x 1
##    State  
##    <chr>  
##  1 Alabama
##  2 Alabama
##  3 Alabama
##  4 Alabama
##  5 Alabama
##  6 Alabama
##  7 Alabama
##  8 Alabama
##  9 Alabama
## 10 Alabama
## # … with 2,513 more rows

MERGED %>% summarise(maxpop = max(population, na.rm = T))

## # A tibble: 1 x 1
##      maxpop
##       <dbl>
## 1 316128839

MERGED %>% filter(population < max(population, na.rm = T))

## # A tibble: 2,523 x 6
##    State   Abbreviation ages     year population `area (sq. mi)`
##    <chr>   <chr>        <chr>   <dbl>      <dbl>           <dbl>
##  1 Alabama AL           under18  2012    1117489           52423
##  2 Alabama AL           total    2012    4817528           52423
##  3 Alabama AL           under18  2010    1130966           52423
##  4 Alabama AL           total    2010    4785570           52423
##  5 Alabama AL           under18  2011    1125763           52423
##  6 Alabama AL           total    2011    4801627           52423
##  7 Alabama AL           total    2009    4757938           52423
##  8 Alabama AL           under18  2009    1134192           52423
##  9 Alabama AL           under18  2013    1111481           52423
## 10 Alabama AL           total    2013    4833722           52423
## # … with 2,513 more rows

MERGED %>% filter(population > max(population, na.rm = T))

## # A tibble: 0 x 6
## # … with 6 variables: State <chr>, Abbreviation <chr>, ages <chr>, year <dbl>,
## #   population <dbl>, `area (sq. mi)` <dbl>

MERGED %>% filter(population < max(population, na.rm = T)) %>% 
    select(State)

## # A tibble: 2,523 x 1
##    State  
##    <chr>  
##  1 Alabama
##  2 Alabama
##  3 Alabama
##  4 Alabama
##  5 Alabama
##  6 Alabama
##  7 Alabama
##  8 Alabama
##  9 Alabama
## 10 Alabama
## # … with 2,513 more rows

MERGED %>% filter(population > max(population, na.rm = T)) %>% 
    select(State)

## # A tibble: 0 x 1
## # … with 1 variable: State <chr>

MERGED %>% summarise(distpop = n_distinct(population, na.rm = T))

## # A tibble: 1 x 1
##   distpop
##     <int>
## 1    2524

HI <- MERGED %>% group_by(State) %>% summarise(meanpopulation = mean(population, 
    na.rm = TRUE), sdpop = sd(population, na.rm = TRUE), minpop = min(population, 
    na.rm = T), maxpop = max(population, na.rm = T)) %>% view()

MERGED %>% arrange(population) %>% head()

## # A tibble: 6 x 6
##   State                Abbreviation ages     year population `area (sq. mi)`
##   <chr>                <chr>        <chr>   <dbl>      <dbl>           <dbl>
## 1 District of Columbia DC           under18  2010     101309              68
## 2 District of Columbia DC           under18  2009     102098              68
## 3 District of Columbia DC           under18  2008     102257              68
## 4 District of Columbia DC           under18  2011     103906              68
## 5 District of Columbia DC           under18  2007     104126              68
## 6 District of Columbia DC           under18  2006     105651              68

MERGED %>% arrange("area (sq. mi)") %>% head()

## # A tibble: 6 x 6
##   State   Abbreviation ages     year population `area (sq. mi)`
##   <chr>   <chr>        <chr>   <dbl>      <dbl>           <dbl>
## 1 Alabama AL           under18  2012    1117489           52423
## 2 Alabama AL           total    2012    4817528           52423
## 3 Alabama AL           under18  2010    1130966           52423
## 4 Alabama AL           total    2010    4785570           52423
## 5 Alabama AL           under18  2011    1125763           52423
## 6 Alabama AL           total    2011    4801627           52423

MERGED %>% arrange(year) %>% head()

## # A tibble: 6 x 6
##   State   Abbreviation ages     year population `area (sq. mi)`
##   <chr>   <chr>        <chr>   <dbl>      <dbl>           <dbl>
## 1 Alabama AL           under18  1990    1050041           52423
## 2 Alabama AL           total    1990    4050055           52423
## 3 Alaska  AK           total    1990     553290          656425
## 4 Alaska  AK           under18  1990     177502          656425
## 5 Arizona AZ           under18  1990    1006040          114006
## 6 Arizona AZ           total    1990    3684097          114006

# Area (sq. mi)

MERGED %>% group_by(State) %>% summarise(mean = mean("area (sq. mi)"), 
    n = n())

## # A tibble: 53 x 3
##    State                 mean     n
##    <chr>                <dbl> <int>
##  1 Alabama                 NA    48
##  2 Alaska                  NA    48
##  3 Arizona                 NA    48
##  4 Arkansas                NA    48
##  5 California              NA    48
##  6 Colorado                NA    48
##  7 Connecticut             NA    48
##  8 Delaware                NA    48
##  9 District of Columbia    NA    48
## 10 Florida                 NA    48
## # … with 43 more rows

mean(MERGED$"area (sq. mi)", na.rm = T)

## [1] 74223.74

sd(MERGED$"area (sq. mi)", na.rm = T)

## [1] 94921.03

var(MERGED$"area (sq. mi)", na.rm = T)

## [1] 9010001415

n_distinct(MERGED$"area (sq. mi)", na.rm = T)

## [1] 52

min(MERGED$"area (sq. mi)", na.rm = T)

## [1] 68

max(MERGED$"area (sq. mi)", na.rm = T)

## [1] 656425

# Year

MERGED %>% summarise(meanyear = mean(year, na.rm = T))

## # A tibble: 1 x 1
##   meanyear
##      <dbl>
## 1    2002.

MERGED %>% summarise(sdyear = sd(year, na.rm = T))

## # A tibble: 1 x 1
##   sdyear
##    <dbl>
## 1   6.92

MERGED %>% summarise(minyear = min(year, na.rm = T))

## # A tibble: 1 x 1
##   minyear
##     <dbl>
## 1    1990

MERGED %>% summarise(maxyear = max(year, na.rm = T))

## # A tibble: 1 x 1
##   maxyear
##     <dbl>
## 1    2013

MERGED %>% summarise(varyear = var(year, na.rm = T))

## # A tibble: 1 x 1
##   varyear
##     <dbl>
## 1    47.9

mean(MERGED$year, na.rm = T)

## [1] 2001.5

sd(MERGED$year, na.rm = T)

## [1] 6.923547

var(MERGED$year, na.rm = T)

## [1] 47.93551

n_distinct(MERGED$year, na.rm = T)

## [1] 24

min(MERGED$year, na.rm = T)

## [1] 1990

max(MERGED$year, na.rm = T)

## [1] 2013

HI2 <- MERGED %>% group_by(State) %>% summarise(meanyear = mean(year, 
    na.rm = T), sdyear = sd(year, na.rm = T), minyear = min(year, 
    na.rm = T), maxyear = max(year, na.rm = T)) %>% view()

HI3 <- MERGED %>% group_by(State, ages) %>% summarise(meanyear = mean(year, 
    na.rm = T), sdyear = sd(year, na.rm = T), minyear = min(year, 
    na.rm = T), maxyear = max(year, na.rm = T)) %>% view()

library(purrr)
MERGED %>% split(MERGED$ages) %>% map(summary)

## $total
##     State           Abbreviation           ages                year     
##  Length:1272        Length:1272        Length:1272        Min.   :1990  
##  Class :character   Class :character   Class :character   1st Qu.:1996  
##  Mode  :character   Mode  :character   Mode  :character   Median :2002  
##                                                           Mean   :2002  
##                                                           3rd Qu.:2007  
##                                                           Max.   :2013  
##                                                                         
##    population        area (sq. mi)   
##  Min.   :   453690   Min.   :    68  
##  1st Qu.:  1573433   1st Qu.: 35387  
##  Median :  3855132   Median : 56276  
##  Mean   : 10881611   Mean   : 74253  
##  3rd Qu.:  6587580   3rd Qu.: 84904  
##  Max.   :316128839   Max.   :656425  
##  NA's   :10          NA's   :48      
## 
## $under18
##     State           Abbreviation           ages                year     
##  Length:1272        Length:1272        Length:1272        Min.   :1990  
##  Class :character   Class :character   Class :character   1st Qu.:1996  
##  Mode  :character   Mode  :character   Mode  :character   Median :2002  
##                                                           Mean   :2002  
##                                                           3rd Qu.:2007  
##                                                           Max.   :2013  
##                                                                         
##    population       area (sq. mi)   
##  Min.   :  101309   Min.   :    68  
##  1st Qu.:  390643   1st Qu.: 35387  
##  Median :  996839   Median : 56276  
##  Mean   : 2729506   Mean   : 74253  
##  3rd Qu.: 1609652   3rd Qu.: 84904  
##  Max.   :74134167   Max.   :656425  
##  NA's   :10         NA's   :48

# For each variable I attempted to use as many different ways
# as I could to find various summary statistics. I also tried
# to have at least 5 different summary statistics. For
# population, I found the mean, the standard deviation,
# minimum, maximum, and the number of distinct populations
# there were. I did this in multiple ways, using filter,
# summarise, group_by, arrange, etc. Then, I found the same
# summary statistics of the area, in square miles, variable.
# Lastly, I found the same for year but I again used several
# different methods. I used summarise alone and also with
# group_by.

# Visualizing

cormat <- MERGED %>% select_if(is.numeric) %>% cor(use = "pair")

tidycor <- cormat %>% as.data.frame %>% rownames_to_column("var1") %>% 
    pivot_longer(-1, names_to = "var2", values_to = "correlation")
tidycor %>% ggplot(aes(var1, var2, fill = correlation)) + geom_tile() + 
    scale_fill_gradient2(low = "yellow", high = "red") + theme(axis.text.x = element_text(angle = 90, 
    hjust = 1)) + geom_text(aes(label = round(correlation, 2))) + 
    coord_fixed()

# This graph has different colors on it to show if there's
# any patterns in correlation values for multiple variables.
# The darker the red, the more of a correalation the two
# variables had. My graph, however, showed only strong
# correlations between the variable against itself. Of course
# a variable has a strong correlation with itself. No other
# variables, however had strong correlations with one
# another.

ggplot(MERGED, aes(x = year, y = population)) + geom_bar(stat = "summary", 
    fun = mean) + geom_errorbar(stat = "summary", fun.data = mean_se) + 
    ggtitle("Population per year")

ggplot() + geom_point(data = MERGED, aes(x = year, y = population, 
    color = State)) + ggtitle("Population per year of each state")

# This graph showed that for each state, as the years
# increased, population numbers also increased for the most
# part. The higher population numbers on my graph were region
# values. The rest were all pretty close to one another.

library(tidyverse)
ggplot() + geom_point(data = MERGED, aes(x = ages, y = population, 
    color = State)) + ggtitle("Ages of the population in each state")

ggplot(MERGED, aes(x = ages, y = population, fill = State)) + 
    geom_bar(stat = "summary", fun = mean) + geom_errorbar(stat = "summary", 
    fun.data = mean_se) + ggtitle("Ages of the population in each state") + 
    scale_y_continuous(limits = c(0, 5e+08))

# This graph showed for each state, ages which consisted of
# two categories, total and under 18, and population numbers.
# It showed that there was a greater number of people that
# were over the age of 18, than under 18.

# Dimensionality Reduction

MERGED2 <- na.omit(MERGED)
mypr <- prcomp(MERGED2[4:6], scale = TRUE)
mypr

## Standard deviations (1, .., p=3):
## [1] 1.0509421 1.0000000 0.9463195
## 
## Rotation (n x k) = (3 x 3):
##                      PC1           PC2        PC3
## year          -0.3002202  9.053926e-01 -0.3002202
## population    -0.7071068  2.308190e-14  0.7071068
## area (sq. mi) -0.6402092 -4.245755e-01 -0.6402092

summary(mypr)

## Importance of components:
##                           PC1    PC2    PC3
## Standard deviation     1.0509 1.0000 0.9463
## Proportion of Variance 0.3682 0.3333 0.2985
## Cumulative Proportion  0.3682 0.7015 1.0000

plot(mypr, type = "l")

biplot(mypr, scale = 0)

str(mypr)

## List of 5
##  $ sdev    : num [1:3] 1.051 1 0.946
##  $ rotation: num [1:3, 1:3] -3.00e-01 -7.07e-01 -6.40e-01 9.05e-01 2.31e-14 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:3] "year" "population" "area (sq. mi)"
##   .. ..$ : chr [1:3] "PC1" "PC2" "PC3"
##  $ center  : Named num [1:3] 2002 3494881 74253
##   ..- attr(*, "names")= chr [1:3] "year" "population" "area (sq. mi)"
##  $ scale   : Named num [1:3] 6.92 5.01e+06 9.49e+04
##   ..- attr(*, "names")= chr [1:3] "year" "population" "area (sq. mi)"
##  $ x       : num [1:2448, 1:3] 0.0274 -0.4947 0.1122 -0.4035 0.0696 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:3] "PC1" "PC2" "PC3"
##  - attr(*, "class")= chr "prcomp"

mypr$x

##                   PC1          PC2           PC3
##    [1,]  0.0273739652  1.470708398 -6.435333e-01
##    [2,] -0.4947070642  1.470708398 -1.214523e-01
##    [3,]  0.1121960553  1.209170342 -5.549080e-01
##    [4,] -0.4034740278  1.209170342 -3.923787e-02
##    [5,]  0.0695683489  1.339939370 -5.990040e-01
##    [6,] -0.4491015519  1.339939370 -8.033406e-02
##    [7,] -0.3562132539  1.078401314  2.250713e-04
##    [8,]  0.1551027193  1.078401314 -5.110909e-01
##    [9,] -0.0151401545  1.601477426 -6.877429e-01
##   [10,] -0.5403539193  1.601477426 -1.625291e-01
##   [11,] -0.2574820841  0.816863258  7.494133e-02
##   [12,]  0.2420939626  0.816863258 -4.246347e-01
##   [13,] -0.3072451519  0.947632286  3.798068e-02
##   [14,]  0.1983608672  0.947632286 -4.676253e-01
##   [15,] -0.1562199768  0.555325201  1.471267e-01
##   [16,]  0.3309436537  0.555325201 -3.400370e-01
##   [17,] -0.2079316562  0.686094230  1.121146e-01
##   [18,]  0.2862315961  0.686094230 -3.820486e-01
##   [19,] -0.1073444374  0.424556173  1.849748e-01
##   [20,]  0.3748088202  0.424556173 -2.971784e-01
##   [21,] -0.0601392574  0.293787145  2.244934e-01
##   [22,]  0.4182523754  0.293787145 -2.538983e-01
##   [23,]  0.0316439337  0.032249089  3.061576e-01
##   [24,]  0.5039423808  0.032249089 -1.661408e-01
##   [25,] -0.0134753428  0.163018117  2.645532e-01
##   [26,]  0.4611193899  0.163018117 -2.100416e-01
##   [27,]  0.5905422086 -0.229288967 -7.929324e-02
##   [28,]  0.1236579669 -0.229288967  3.875910e-01
##   [29,]  0.0771873614 -0.098519939  3.473379e-01
##   [30,]  0.5470412250 -0.098519939 -1.225160e-01
##   [31,]  0.1706094464 -0.360057995  4.273632e-01
##   [32,]  0.6343323090 -0.360057995 -3.635963e-02
##   [33,]  0.6770393145 -0.490827023  7.657082e-03
##  [ reached getOption("max.print") -- omitted 2415 rows ]

MERGED <- na.omit(MERGED)
MERGED2 <- cbind(MERGED, mypr$x[, 1:2])

head(MERGED2)

##     State Abbreviation    ages year population area (sq. mi)         PC1
## 1 Alabama           AL under18 2012    1117489         52423  0.02737397
## 2 Alabama           AL   total 2012    4817528         52423 -0.49470706
## 3 Alabama           AL under18 2010    1130966         52423  0.11219606
## 4 Alabama           AL   total 2010    4785570         52423 -0.40347403
## 5 Alabama           AL under18 2011    1125763         52423  0.06956835
## 6 Alabama           AL   total 2011    4801627         52423 -0.44910155
##        PC2
## 1 1.470708
## 2 1.470708
## 3 1.209170
## 4 1.209170
## 5 1.339939
## 6 1.339939

library(ggplot2)

ggplot(MERGED2, aes(PC1, PC2, col = Abbreviation, fill = Abbreviation)) + 
    stat_ellipse(geom = "polygon", col = "black", alpha = 0.5) + 
    geom_point(shape = 21, col = "black")

cor(MERGED[4:6], MERGED2[7:8])

##                      PC1           PC2
## year          -0.3155140  9.053926e-01
## population    -0.7431283  2.266122e-14
## area (sq. mi) -0.6728228 -4.245755e-01

# This showed that the correlation between population and PC3
# was the highest. Then, there were negative correlations
# between year and PC1, population and PC1, which actually
# had a high negative correlation, area and PC1, area and
# PC2, year and PC3, and area and PC3. When the correlation
# is high and positive it means that as the value for an
# observsation increases along the principal component, so
# does for example the population because it had a value of
# 0.7. With a negative correaltion, as one increases, the
# other decreases. If the value is small, that indicates
# there's no real correlation, as shown with the PC2.

## paste this chunk into the ```{r setup} chunk at the top of
## your project 1 .Rmd file

knitr::opts_chunk$set(echo = TRUE, eval = TRUE, fig.align = "center", 
    warning = F, message = F, tidy = TRUE, tidy.opts = list(width.cutoff = 60), 
    R.options = list(max.print = 100))

…

Project 1: Exploratory Data Analysis

April 6, 2021

Related Posts