When possible I’ll break down each step by its function to help explain the process; as we gain knowledge, I’ll start to condense the code and speed things up.

FUN WITHIN LAPPLY

In this example we’ll work our way to show how to create our own function and use it in one of the apply functions.

head(flags)
dim(flags)
[1] 194  30
viewinfo()   #gives us a file of more details and information
class(flags)
[1] "data.frame"
# _________but we want to know the class of each column, we can use a loop or we can use lapply
cls_list <- lapply(flags, class)
$name
[1] "character"

$landmass
[1] "integer"

$zone
[1] "integer"
as.character(cls_list)
 [1] "character" "integer"   "integer"   "integer"   "integer"   "integer"   "integer"   "integer"   "integer"   "integer"   "integer"  
[12] "integer"   "integer"   "integer"   "integer"   "integer"   "integer"   "character" "integer"   "integer"   "integer"   "integer"  
[23] "integer"   "integer"   "integer"   "integer"   "integer"   "integer"   "character" "character"
 sum(flags$orange)   #___to sum how many times orange is in a country's flag
[1] 26
flag_colors <- flags[, 11:17]     #_____to extract columns 11-17
sapply(flag_colors, sum)          #_____to sum each column in the subset
   red  green   blue   gold  white  black orange 
   153     91     99     91    146     52     26 
sapply(flag_colors,mean)
      red     green      blue      gold     white     black    orange 
0.7886598 0.4690722 0.5103093 0.4690722 0.7525773 0.2680412 0.1340206

flag_shapes <- flags[, 19:23] #_____extract columns
shape_mat <- sapply(flag_shapes,range)
     circles crosses saltires quarters sunstars
[1,]       0       0        0        0        0
[2,]       4       2        1        4       50
class(shape_mat)
[1] "matrix" "array"  

SAPPLY

sapply(flags, class)
       name    landmass        zone        area  population    language    religion        bars     stripes     colours         red 
"character"   "integer"   "integer"   "integer"   "integer"   "integer"   "integer"   "integer"   "integer"   "integer"   "integer" 
      green        blue        gold       white       black      orange     mainhue     circles     crosses    saltires    quarters 
  "integer"   "integer"   "integer"   "integer"   "integer"   "integer" "character"   "integer"   "integer"   "integer"   "integer" 
   sunstars    crescent    triangle        icon     animate        text     topleft    botright 
  "integer"   "integer"   "integer"   "integer"   "integer"   "integer" "character" "character"

MEAN OF COL

Here is a simple way of calculating the mean of a column

  • Read csv into a df
  • The column we want to calculate the mean of is “Ozone”
  • Select the wanted column “Ozone” and assign it to df “oz”
  • Omit all NAs from “oz” and assign it back to “oz”
  • Calculate the mean of the column

read.csv

mean

hw1<- read.csv("hw1_data.csv")

oz <- hw1 |>     # we have 153 rows
  select(Ozone)

oz <-  na.omit(oz)     # exclude rows with Ozone = NA gives us 116 rows
mean(oz$Ozone)
  #42.12937

SPLIT DF

Aside from using group_by we can use split to subset a df

library(datasets)
head(airquality)
s <- split(airquality,airquality$Month)
$`5`
   Ozone Solar.R Wind Temp Month Day
1     41     190  7.4   67     5   1
2     36     118  8.0   72     5   2
3     12     149 12.6   74     5   3
4     18     313 11.5   62     5   4..
..$`6`
   Ozone Solar.R Wind Temp Month Day
32    NA     286  8.6   78     6   1
33    NA     287  9.7   74     6   2
34    NA     242 16.1   67     6   3
35    NA     186  9.2   84     6   4
36    NA     220  8.6   85     6   5

MEAN OF COLUMNS

mean of all cols

#__________So let's calculate the mean of all columns 
lapply(s, colMeans)
$`5`
   Ozone  Solar.R     Wind     Temp    Month      Day 
      NA       NA 11.62258 65.54839  5.00000 16.00000 

$`6`
    Ozone   Solar.R      Wind      Temp     Month       Day 
       NA 190.16667  10.26667  79.10000   6.00000  15.50000 

$`7`
     Ozone    Solar.R       Wind       Temp      Month        Day 
        NA 216.483871   8.941935  83.903226   7.000000  16.000000 

$`8`
    Ozone   Solar.R      Wind      Temp     Month       Day 
       NA        NA  8.793548 83.967742  8.000000 16.000000 

$`9`
   Ozone  Solar.R     Wind     Temp    Month      Day 
      NA 167.4333  10.1800  76.9000   9.0000  15.5000

mean of 1 col

#__________what if we want certain column
lapply(s, function(santa){ mean(santa[ ,"Wind"])})
$`5`
[1] 11.62258

$`6`
[1] 10.26667

$`7`
[1] 8.941935

$`8`
[1] 8.793548

$`9`
[1] 10.18

mean of several columns

#__________For multiple columns I have to use colMeans instead of mean()
lapply(s, function(santa){ colMeans(santa[ ,c("Wind","Temp")])})
$`5`
    Wind     Temp 
11.62258 65.54839 

$`6`
    Wind     Temp 
10.26667 79.10000 

$`7`
     Wind      Temp 
 8.941935 83.903226 

$`8`
     Wind      Temp 
 8.793548 83.967742 

$`9`
 Wind  Temp 
10.18 76.90 

sapply

sapply(s, function(santa){ colMeans(santa[ ,c("Ozone","Wind","Temp")])})
             5        6         7         8     9
Ozone       NA       NA        NA        NA    NA
Wind  11.62258 10.26667  8.941935  8.793548 10.18
Temp  65.54839 79.10000 83.903226 83.967742 76.90

na.rm

sapply(s, function(santa){ colMeans(santa[ ,c("Ozone","Wind","Temp")], na.rm = TRUE)})
             5        6         7         8        9
Ozone 23.61538 29.44444 59.115385 59.961538 31.44828
Wind  11.62258 10.26667  8.941935  8.793548 10.18000
Temp  65.54839 79.10000 83.903226 83.967742 76.90000

MEAN  FUNCTION

Scan through a directory for files
Read csv files
Remove NA
Calculate mean of columns
Note: a row might have a value in one column but not the other

install.packages("data.table")
library(data.table)
pollutantmean <- function(directory, pollutant, id = 1:332) {
        ## Mean of one pollutant column across a set of sensor csv files.
        ##
        ## directory: folder containing the csv files (relative to the
        ##            current working directory, or an absolute path).
        ## pollutant: column name to average, e.g. "sulfate" or "nitrate".
        ## id:        integer vector selecting which files (default all 332);
        ##            may be a single value or a (possibly reverse) sequence.
        ## Returns the mean of the selected column with NAs removed.
        # full.names = TRUE yields ready-to-read paths, so no setwd() is
        # needed.  (The original's fileDir <- setwd(...) captured the
        # *previous* working directory — setwd() returns it invisibly —
        # and changed the wd as a side effect.)
        fileList <- list.files(directory, full.names = TRUE)
        wantedFiles <- fileList[id]
        dataIn <- lapply(wantedFiles, read.csv)
        # Stack all the selected files into one data frame
        bound_dataIn <- do.call(rbind, dataIn)
        # Drop NA rows of the requested column only: the same row can be
        # missing one pollutant but not the other
        selec_col <- na.omit(bound_dataIn[pollutant])
        mean(selec_col[, pollutant])
}
pollutantmean("specdata","sulfate", 1:10)
[1] 4.064128
pollutantmean("specdata","nitrate", 70:72)
[1] 1.706047
pollutantmean("specdata","nitrate", 23)
[1] 1.280833
pollutantmean("specdata", "sulfate", 34)
[1] 1.477143
pollutantmean("specdata", "nitrate")
[1] 1.702932
pollutantmean("specdata","nitrate", 30:23)
[1] 4.284494

Hypothesis

  • Need to create a function that calculates the mean of some data
  • Function will accept three variables, “directory” , “pollutant” and ID, ID already has a default value
  • Data is to be downloaded and stored locally on a drive
  • Data is a set of pollutant measurements at different sensors
  • Rows will contain NA values
  • File name is the same as the sensor ID
  • No duplicate sensor IDs, hence no duplicate file names
  • Files are .csv format
  • We need to find the mean of each pollutant at each sensor ID
  • Data contains two columns for two pollutants “nitrate” & “sulfate”
  • One col has the date of the reading, and the other column has the sensor ID which matches with the file name
  • Code can be found at W2_Ass…#1-3.R

setwd

  • The first line we set the working directory where the data is located. Note one of the arguments you are asked to pass to the function is the directory where the files are located, so account for that.
  • Assign the working directory to fileDir

list.files

  • We assign a list of all the files in the directory to fileList, view it, and confirm all files are there and nothing unusual is present.
  • Let’s take the argument “directory” that was entered by the user and either paste it to the fileDir to make up a file path for each file we want to read, the newer faster function is file.path() so I’ll
  • use file.path()

file.path

  • As the name describes, it creates a file.path to the file
  • Note that we use fileList[id] which is a subset of all the files contained in fileList
  • How do we know which file we need, well the user passes an argument ID, which could be a single digit: 1, or it could be a sequence: 34:298, it could even be a reverse sequence: 30:25
  • Assign the path to: wantedFiles

.csv

  • Now that we have a file.path to the files we want to read we’ll use lapply to scan through fileList[id] one by one and read the .csv file

lapply

  • Use lapply to scan through fileList[id] one by one and read the .csv file
  • Assign all the data to dataIn

do.call

  • Add all the files we read binding horizontally (rbind)
  • Remember some rows will have NA values

So far here is what a sample of the output looks like:

pollutantmean <- function(directory, pollutant, id = 1:332){
        fileDir <- setwd("D:/Education/R/Data/specdata/")                        
        fileList = list.files(directory)
        wantedFiles = file.path(fileDir,directory,fileList[id],fsep="/" )       
        dataIn <-  lapply(wantedFiles, read.csv)
        bound_dataIn = do.call(rbind,dataIn)
 bound_dataIn
Date sulfate nitrate ID
1   2003-01-01      NA      NA  1
2   2003-01-02      NA      NA  1
3   2003-01-03      NA      NA  1
4   2003-01-04      NA      NA  1
5   2003-01-05      NA      NA  1
6   2003-01-06      NA      NA  1
7   2003-01-07      NA      NA  1
8   2003-01-08      NA      NA  1

na.omit

  • We need to remove all rows that have NA values
  • Be careful here, since the aim is to calculate the mean of each column separately, the same row can have a missing value in one column while not the other column
  • So evaluate each column independently of the other column
  • Save the cleaned rows into selec_col

mean

  • Now that we have a bound df of all the data that has been cleaned of all NA values
  • Calculate the mean for the pollutant column specified by the user
  • mean(selec_col[, pollutant]) means take all rows for the specified pollutant column

COUNT COMPLETE CASES

This is a continuation to the Mean Function: pollutantmeanfunction(). But let’s go back a touch and

User will input TWO arguments: “directory” and “id”
Instead of calculating the mean() of each pollutant (column), we want to know how many readings were taken at each ID/Sensor
That means we need to omit rows with missing values for all pollutants
In other words, every cell in the row has to have a value or it should be omitted
Output the count of observations for each location/ID/sensor along with the ID

Here we can use the majority of the code used in the Mean Function pollutantmeanfunction() with a couple of minor changes, up to right after we rbind the files together, here is the block we’ll reuse:

complete <- function(directory, id = 1:332) {
        ## Count complete observations (no NA in any column) per sensor file.
        ##
        ## directory: folder containing the per-sensor csv files.
        ## id:        integer vector of file indices (default all 332).
        ## Returns a data frame with columns ID and nobs.
        # full.names = TRUE gives readable paths without the setwd() side
        # effect (setwd() also returns the *previous* wd, which the original
        # mistakenly used as the data directory).
        fileList <- list.files(directory, full.names = TRUE)
        wantedFiles <- fileList[id]
        dataIn <- lapply(wantedFiles, read.csv)
        bound_dataIn <- do.call(rbind, dataIn)
        # complete.cases() checks every column of each row at once:
        # every cell in the row must have a value or the row is dropped
        ok <- bound_dataIn[complete.cases(bound_dataIn), ]
        # Count rows per sensor ID with base R (no dplyr needed);
        # table() sorts IDs ascending, matching the transcript output
        tab <- table(ok$ID)
        total <- data.frame(ID = as.numeric(names(tab)),
                            nobs = as.integer(tab))
        # The original returned bound_dataIn by mistake; the transcript
        # (ID/nobs tables) shows `total` is the intended result
        total
}
complete("specdata",1)
  ID nobs
1  1  117
complete("specdata",1:3)
  ID nobs
1  1  117
2  2 1041
3  3  243
complete("specdata", 30:25)
  ID nobs
1 25  463
2 26  586
3 27  338
4 28  475
5 29  711
6 30  932
complete("specdata",c(2,4,8,10,12))
  ID nobs
1  2 1041
2  4  474
3  8  192
4 10  148
5 12   96
complete("specdata",3)
  ID nobs
1  3  243

complete.cases

Up to bound_dataIn, all we’ve done is use the code from before, and now we have a long df of data with NA cells. You can see a sample output of bound_dataIn in the above example.

Well why don’t we just use

selec_col = na.omit(bound_dataIn[pollutant])

Well for one, we are not going to operate on each column separately as we did before. We could run na.omit on all columns; here we only have two columns so we can do it, but what if we have 1000 columns? The easier way is to use complete.cases().

Here we want to look for a complete case across all columns so we use and assign it to ok, and now you see what the output looks like. What I notice is that the first complete case is not till row 279 for ID=1, so it eliminated 278 rows right at the start of the first file.

ok <- bound_dataIn[complete.cases(bound_dataIn), ]
           Date sulfate nitrate ID
279  2003-10-06   7.210   0.651  1
285  2003-10-12   5.990   0.428  1
291  2003-10-18   4.680   1.040  1
297  2003-10-24   3.470   0.363  1
303  2003-10-30   2.420   0.507  1
315  2003-11-11   1.430   0.474  1
321  2003-11-17   2.760   0.425  1
327  2003-11-23   3.410   0.964  1
333  2003-11-29   1.300   0.491  1
339  2003-12-05   3.150   0.669  1
345  2003-12-11   2.870   0.400  1
357  2003-12-23   2.270   0.715  1
363  2003-12-29   2.330   0.554  1
369  2004-01-04   1.840   0.803  1
375  2004-01-10   7.130   0.518  1
387  2004-01-22   2.050   1.400  1
393  2004-01-28   2.050   0.979  1

groupby

Here is a look at a redundant explanation using group_by. It is not needed because count(ID, name=…) automatically counts the occurrence of each ID. The reason I put it in here and did not use it is to explain the thinking of what you’re doing. In a sense we are grouping all the IDs together and counting how many times they occur.

        grouped <- ok |> 
                group_by(ok$ID) |> 
                count(ok$ID, name="nobs")

count

Nothing special here, all that’s left to do is count the observations for each ID,  and as specified we need to save it in a new column “nobs” so we need to create a new column and store the count values in it:

        #count() counts the unique values of one or more variables, it creates a column (name="x") in df
        total <- ok |> 
                count(ID, name ="nobs")

COR THRESHOLD

Build a function that accepts a directory, and a threshold.
We’ll use the entire dataset to start with (hint: use whatever you can from the functions we just created)
Calculate the correlation between the two pollutants/columns if that ID/sensor meets the threshold
Ignore all ID/Sensors/Locations that don’t meet the threshold
Eliminate all rows that have any missing values (since we are looking for a correlation between the two) it means both columns have to have values
Save the correlation values for each sensor and provide a summary for each input argument

# Let's initialize some vectors
met_thr <- numeric()
cvalue <- numeric()
corr_vec <- c()

#fetch  all the complete.cases data from previous function complete()
all_data <- complete("specdata",1:332)


corr <- function(directory, threshold = 0) {
        ## Correlation between the two pollutant columns (cols 2 and 3) for
        ## every sensor whose count of complete cases exceeds `threshold`.
        ##
        ## directory: folder containing the per-sensor csv files.
        ## threshold: minimum number of complete rows a sensor must exceed.
        ## Returns a numeric vector of correlations; NULL when no sensor
        ## qualifies (matching the original transcript for threshold 5000).
        #
        # NOTE(review): the original version could not be parsed — two
        # comment continuation lines lacked the leading `#` — and it relied
        # on the globals all_data/met_thr/corr_vec; its innermost
        # `for (v in nrow(...))` loop also ran exactly once per file.
        # This version is self-contained and computes complete cases itself.
        fileList <- list.files(directory, full.names = TRUE)
        corr_vec <- c()
        for (f in fileList) {
                dat <- read.csv(f)
                # A correlation needs both pollutants present on the same
                # row, so eliminate every row with any missing value
                ok <- dat[complete.cases(dat), ]
                # Only sensors that meet the threshold contribute
                if (nrow(ok) > threshold) {
                        cvalue <- cor(ok[[2]], ok[[3]])
                        corr_vec <- c(corr_vec, cvalue)
                }
        }
        corr_vec
}
cr <- corr("specdata", 150)
head(cr)
[1] -0.01895754 -0.14051254 -0.04389737 -0.06815956 -0.12350667 -0.07588814
summary(cr)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-0.21057 -0.04999  0.09463  0.12525  0.26844  0.76313 

cr <- corr("specdata", 400)  
head(cr)
[1] -0.01895754 -0.04389737 -0.06815956 -0.07588814  0.76312884 -0.15782860
summary(cr)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-0.17623 -0.03109  0.10021  0.13969  0.26849  0.76313 

cr <- corr("specdata", 5000)
summary(cr)
Length  Class   Mode 
     0   NULL   NULL 
length(cr)
[1] 0

cr <- corr("specdata")
summary(cr)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-1.00000 -0.05282  0.10718  0.13684  0.27831  1.00000 
length(cr)
[1] 323

for loop (nested)

cor

Here you can see the use of nested for loops, and cor() function all together:

   for ( index in met_thr[1]){
                        wantedFiles = file.path(fileDir,directory,fileList[index],fsep="/" )
                        rawdataIn <-  lapply(wantedFiles, read.csv)
                        
                        #loop through the df one station at a time
                        for (ii in 1:length((rawdataIn))){
                                #loop each station file one row at a time skipping NA rows, to calculate correlation of col2,3
                                for (v in nrow(rawdataIn[[ii]])){
                                        cvalue <-  cor(rawdataIn[[ii]][2], rawdataIn[[ii]][3], use = "complete.obs")
                                        corr_vec <- c(corr_vec, cvalue)
                                }
                        }
                }

COMPARE CARE

Mortality Rates

#Compare Care data found in "D:/~/outcome-of-care-measures" & "/hospital-data.csv"
#Let's read one of the files into a df# read.csv("D:/~/outcome-of-care-measures.csv", colClasses = "character")
# could have been used to set all columns to characters which I am not going to use
outcome <- read.csv("D:~/outcome-of-care-measures.csv")
head(outcome)
dim(outcome)
object.size(outcome)
names(outcome)
summary(outcome)
ls(outcome)
str(outcome)
# the most telling is str() which alerts me that the columns we're about to use are all of
# class='chr' -> need to convert to num

hist

Plot a histogram of the 30 day death rates from heart attacks (col #11).

#_______________Let's see a Histogram of the 30-day death rates from heart attack 
#_________review attached pdf as well as names() shows that column is #11
#_________ so to coerce all rows we use [ , 11]
outcome[ ,11] <- as.numeric(outcome[ ,11])
Warning message:
NAs introduced by coercion 
hist(outcome[ ,11])

Best in State

stop

stop(…, call. = TRUE, domain = NULL) takes zero or more objects which can be coerced to character or a single object is the first argument which we’ll focus on.

if (x > y) stop("are you crazy?! What are you thinking?")

Assignment

  • Write a function(best) that accepts 2 arguments: 2 char abbreviated state name, and an outcome name; in order to
  • Find the best hospital in each state based on 30-day mortality rate (lowest). Use column 2 –

    Hospital.Name

  • Base it on Column 11 –

    Hospital.30.Day.Death..Mortality..Rates.from.Heart.Attack

  • Column 17  –

    Hospital.30.Day.Death..Mortality..Rates.from.Heart.Failure

  • Column 23 –

    Hospital.30.Day.Death..Mortality..Rates.from.Pneumonia

  • Hospitals with no data on that specific outcome should be excluded from that set
  • If a tie exists in that set, then sort in alphabetical order and the first on that list will be the winner
  • If an invalid state is passed, function should throw an error via the stop function “invalid state”. Column 7 –

    State

  • If an invalid outcome is passed to best, function should throw an error via the stop function “invalid outcome”
getfile

Let’s start by creating a function that reads the data in:

#_______________Let's create a function that reads our datain
getfile <- function(file) {
        # Read one csv file from the CompareCare data directory.
        # NOTE: the original did fileDir <- setwd(path), but setwd()
        # returns the *previous* working directory (and changes the wd as
        # a side effect), so fileDir pointed at the wrong place on the
        # first call.  A plain path constant avoids both problems.
        fileDir <- "D:/~/CompareCare/"
        wantedFile <- file.path(fileDir, file)
        return(read.csv(wantedFile))
}
datain <- getfile("outcome-of-care-measures.csv")

best

Create the function that solves the assignment:

best <- function(state, outcome) {
        ## Find the hospital in `state` with the lowest 30-day mortality
        ## rate for `outcome`; ties are broken alphabetically by name.
        ##
        ## state:   2-character state abbreviation, e.g. "TX".
        ## outcome: "heart attack", "heart failure" or "pneumonia".
        ## Returns the winning hospital's name (character).
        # Read data in using the helper getfile()
        datain <- getfile("outcome-of-care-measures.csv")

        ## Check that state and outcome are valid; if not, stop with the
        ## exact messages the assignment requires
        if (!(state %in% datain[, 7])) {
                stop("invalid state")
        }
        if (!(outcome %in% c("heart attack", "heart failure", "pneumonia"))) {
                stop("invalid outcome")
        }

        # Keep only the rows for the state given by the user
        stateonly <- subset(datain, State == state)

        # Map the outcome name to its column number in the dataset
        usethis <- switch(outcome,
                          "heart attack"  = 11,
                          "heart failure" = 17,
                          "pneumonia"     = 23)

        # Keep just State, Hospital.Name and the requested outcome column
        subdata <- select(stateonly, c("State", "Hospital.Name", all_of(usethis)))

        # "Not Available" in the 3rd column is a literal string, not a real
        # NA, so none of the NA functions detect it — drop it manually
        gooddata <- subset(subdata, subdata[3] != "Not Available")

        # The rate column is a character column, so unlist it and coerce
        # it to numeric before sorting
        gooddata[3] <- as.numeric(unlist(gooddata[3]))

        # Ascending by rate (col 3) first, then by name (col 2) to break ties
        ranked <- gooddata |>
                arrange((gooddata[3]), gooddata[2])

        # Lowest rate wins: first row, hospital-name column.
        # (The original ended with an assignment, which returns invisibly.)
        ranked[1, 2]
}
b <- best("TX", "heart attack")
[1] "CYPRESS FAIRBANKS MEDICAL CENTER"

b <- best("TX", "heart failure")
[1] "FORT DUNCAN MEDICAL CENTER"

b <- best("MD", "heart attack")
[1] "JOHNS HOPKINS HOSPITAL, THE"

b <- best("MD", "pneumonia")
[1] "GREATER BALTIMORE MEDICAL CENTER"

b <- best("BB", "heart attack")
Error in best("BB", "heart attack") : invalid state

b <- best("NY", "hert attack")
Error in best("NY", "hert attack") : invalid outcome

 

Rank by State

Continuing the theme of 1 & 2, I’ll create a function rankhospital() that:

  • Takes 3 arguments: State, outcome, rank
  • This time we want to pull that specific hospital that meets that rank for that specific outcome
  • Just as before if state or outcome are invalid execute a stop() with message
  • Rank argument accepts “best”, “worst”, or an integer indicating the desired rank, the lower the number the better the rank
  • If rank argument is greater than the ranks for the outcome, return NA
  • As before, a tie is decided by the alphabetical order of hospitals
  • Output is a character vector of the hospital name meeting the condition
getfile
# select() is in tidyverse so load it
library(tidyverse)

#_______________Let's copy the function from before, that reads our datain
getfile <- function(file) {
        # Read one csv file from the CompareCare data directory.
        # NOTE: the original did fileDir <- setwd(path), but setwd()
        # returns the *previous* working directory (and changes the wd as
        # a side effect), so fileDir pointed at the wrong place on the
        # first call.  A plain path constant avoids both problems.
        fileDir <- "D:/Education/R/Data/CompareCare/"
        wantedFile <- file.path(fileDir, file)
        return(read.csv(wantedFile))
}

rankhospital

#______________Function to return the hospital with the desired ranking based on outcome given
rankhospital <- function(state, outcome, num = "best") {
        ## Return the hospital in `state` holding rank `num` for `outcome`.
        ##
        ## state:   2-character state abbreviation.
        ## outcome: "heart attack", "heart failure" or "pneumonia".
        ## num:     "best", "worst", or an integer rank (1 = lowest rate).
        ## Returns the hospital name; the string "NA" when `num` exceeds the
        ## number of ranked hospitals (matching the transcript — the spec
        ## text says NA, so confirm which form the grader expects).
        # Read data in using the helper getfile()
        datain <- getfile("outcome-of-care-measures.csv")

        ## Check that state and outcome are valid
        if (!(state %in% datain[, 7])) {
                stop("invalid state")
        }
        if (!(outcome %in% c("heart attack", "heart failure", "pneumonia"))) {
                stop("invalid outcome")
        }

        # Keep only the rows for the state given by the user
        stateonly <- subset(datain, State == state)

        # Map the outcome name to its column number in the dataset
        usethis <- switch(outcome,
                          "heart attack"  = 11,
                          "heart failure" = 17,
                          "pneumonia"     = 23)

        # Keep just State, Hospital.Name and the requested outcome column
        subdata <- select(stateonly, c("State", "Hospital.Name", all_of(usethis)))

        # "Not Available" is a literal string, not a real NA; drop manually
        gooddata <- subset(subdata, subdata[3] != "Not Available")

        # Resolve the symbolic ranks once the unusable rows are gone
        if (identical(num, "best")) num <- 1
        if (identical(num, "worst")) num <- nrow(gooddata)

        # The rate column is character; coerce to numeric for sorting
        gooddata[3] <- as.numeric(unlist(gooddata[3]))

        # Ascending by rate then by name (alphabetical tie-break), and add
        # a rank column numbering the sorted rows
        blah <- gooddata |>
                arrange((gooddata[3]), gooddata[2]) |>
                mutate(rank = row_number())

        # Out-of-range rank: return early (the original buried this in an
        # if/else whose value was an invisible assignment)
        if (num > nrow(gooddata)) {
                return("NA")
        }
        blah[num, 2]
}
yep <- rankhospital("TX", "heart attack", 2)
[1] "HOUSTON NORTHWEST MEDICAL CENTER"
yep <- rankhospital("TX", "heart attack", 1)
[1] "CYPRESS FAIRBANKS MEDICAL CENTER"
yep <- rankhospital("TX", "heart attack", "best")
[1] "CYPRESS FAIRBANKS MEDICAL CENTER"
yep <- rankhospital("TX", "heart attack", 102)
[1] "LONGVIEW REGIONAL MEDICAL CENTER"
yep <- rankhospital("TX", "heart attack", "worst")
[1] "LAREDO MEDICAL CENTER"
yep <- rankhospital("TX", "heart attack", 2000)
[1] "NA"
yep <- rankhospital("TX", "heart failure", 4)
[1] "DETAR HOSPITAL NAVARRO"
> yep <- rankhospital("TX", "heart attack", "worst")
[1] "LAREDO MEDICAL CENTER"
yep <- rankhospital("MD", "heart attack", "worst")
[1] "HARFORD MEMORIAL HOSPITAL"
yep <- rankhospital("MN", "heart attack", 5000)
[1] "NA"

All States Rank

Assignment

Create a function rankall() that:

  • Takes 2 args: (outcome) and hospital ranking (num)
  • Outcome is identical to what we used before
  • (num)
  • Output is a 2 column df containing the hospital in each state that satisfies the rank=num specified by user
  • First column is named: (hospital) and second is (state) containing 2-char abbreviation
  • As before if a hospital doesn’t have data for that outcome it should be excluded
  • num just as before could be “best”, “worst”, an integer within the range or out of range of the ranks
  • Some states might have NA values as no hospital might meet the ranking requirements, it’s ok. This part could be tricky if you misread it. To deal with it:
  • I’ll create a list of all states in the dataset
  • Once user enters a num, and the dataset is filtered to that value, I’ll compare this subset to the list of all states and enter NA for any rows in the list of all states that are not in the new subset (filtered) dataset
  • Ranking ties are handled as before
  • Input validity is as before
getfile
# select() is in tidyverse so load it
library(tidyverse)

#_______________Let's copy the function from before, that reads our datain
getfile <- function(file) {
        # Read one csv file from the CompareCare data directory.
        # NOTE: the original did fileDir <- setwd(path), but setwd()
        # returns the *previous* working directory (and changes the wd as
        # a side effect), so fileDir pointed at the wrong place on the
        # first call.  A plain path constant avoids both problems.
        fileDir <- "D:/Education/R/Data/CompareCare/"
        wantedFile <- file.path(fileDir, file)
        return(read.csv(wantedFile))
}
datain <- getfile("outcome-of-care-measures.csv")

rankall

#______________Function to return the hospital with the desired ranking based on outcome given
rankall <- function(outcome, num = "best") {
        ## For every state, return the hospital at rank `num` for `outcome`.
        ## Output: data frame with columns State and Hospital; states with
        ## fewer ranked hospitals than `num` get NA in the Hospital column.
        #
        # NOTE(review): the original could not be parsed — a comment was
        # split over two lines and the continuation lacked `#` — and it
        # relied on a global `datain`; reading the file here makes the
        # function self-contained like best() and rankhospital().
        datain <- getfile("outcome-of-care-measures.csv")

        # Check validity of the outcome input
        if (!(outcome %in% c("heart attack", "heart failure", "pneumonia"))) {
                stop("invalid outcome")
        }
        # Map the outcome name to its column number in the dataset
        usethis <- switch(outcome,
                          "heart attack"  = 11,
                          "heart failure" = 17,
                          "pneumonia"     = 23)
        # Filter out the columns not needed
        subdata <- select(datain, c("State", "Hospital.Name", all_of(usethis)))
        # "Not Available" in the 3rd column is a literal string, not a real
        # NA, so drop those rows manually
        gooddata <- subset(subdata, subdata[3] != "Not Available")
        # The rate column is character; coerce it to numeric
        gooddata[3] <- as.numeric(unlist(gooddata[3]))
        #__so far it's been the same as the other sections above

        # Group by state; `found` = how many usable rows each state has
        groudata <- gooddata |>
                group_by(gooddata$State) |>
                mutate(found = n())
        # Sort ascending within each group (rate, then name to break ties)
        # and add a per-group `rank` column
        bystate <- groudata |>
                arrange((groudata[3]), groudata[2], .by_group = TRUE) |>
                mutate(rank = row_number())

        # Resolve the symbolic values of `num`
        if (identical(num, "best")) num <- 1
        # "worst" is the last row in each state group, i.e. rank == found;
        # this makes `num` a whole column, and the subset below compares it
        # elementwise against `rank` to pick each group's last row
        if (identical(num, "worst")) num <- bystate$found

        # Keep the rows whose rank matches the requested num
        rankedout <- subset(bystate, bystate$rank == num)

        # One row per state (duplicates removed), so states with no
        # qualifying hospital still appear in the final output
        statesfound <- bystate[, c("State", "Hospital.Name", "found")]
        statesfound <- statesfound[!duplicated(statesfound$State), ]

        # Left-join rankedout onto the full state list; states missing from
        # rankedout get NA.  Both inputs carry a Hospital.Name column, so
        # the one we want (from rankedout) is suffixed .y by merge()
        almostlist <- merge(statesfound, rankedout, by = "State", all.x = TRUE)
        outlist <- almostlist[c("State", "Hospital.Name.y")]
        outlist <- outlist |>
                rename(Hospital = Hospital.Name.y)
        return(outlist)
}
 outlist <- rankall("pneumonia",20)
 head(outlist,10)
   State                                           Hospital
1     AK                                               <NA>
2     AL                             CHILTON MEDICAL CENTER
3     AR         BAPTIST HEALTH MEDICAL CENTER HEBER SPINGS
4     AZ          SCOTTSDALE HEALTHCARE-SHEA MEDICAL CENTER
5     CA FOUNTAIN VALLEY REGIONAL HOSPITAL & MEDICAL CENTER
6     CO                   VALLEY VIEW HOSPITAL ASSOCIATION
7     CT                            MIDSTATE MEDICAL CENTER
8     DC                                               <NA>
9     DE                                               <NA>
10    FL                    KENDALL REGIONAL MEDICAL CENTER
outlist <- rankall("pneumonia","worst")
tail(outlist,3)
   State                                   Hospital
52    WI MAYO CLINIC HEALTH SYSTEM - NORTHLAND, INC
53    WV                     PLATEAU MEDICAL CENTER
54    WY           NORTH BIG HORN HOSPITAL DISTRICT
outlist <- rankall("heart failure")
tail(outlist,10)
   State                                                          Hospital
45    TN                         WELLMONT HAWKINS COUNTY MEMORIAL HOSPITAL
46    TX                                        FORT DUNCAN MEDICAL CENTER
47    UT VA SALT LAKE CITY HEALTHCARE - GEORGE E. WAHLEN VA MEDICAL CENTER
48    VA                                          SENTARA POTOMAC HOSPITAL
49    VI                            GOV JUAN F LUIS HOSPITAL & MEDICAL CTR
50    VT                                              SPRINGFIELD HOSPITAL
51    WA                                         HARBORVIEW MEDICAL CENTER
52    WI                                    AURORA ST LUKES MEDICAL CENTER
53    WV                                         FAIRMONT GENERAL HOSPITAL
54    WY                                        CHEYENNE VA MEDICAL CENTER
r <- rankall("heart attack", 4)
as.character(subset(r, State == "HI")$Hospital)
#[1] "CASTLE MEDICAL CENTER"
r2 <- rankall("pneumonia", "worst")
as.character(subset(r2, State == "NJ")$Hospital)
#[1] "BERGEN REGIONAL MEDICAL CENTER"
r3 <- rankall("heart failure", 10)
as.character(subset(r3, State == "NV")$Hospital)
[1] "RENOWN SOUTH MEADOWS MEDICAL CENTER"

 

 

COMPARECARE

DATA

data input

# select() is in tidyverse so load it
library(tidyverse)

#_______________Let's copy the function from before, that reads our datain
#_______________Let's copy the function from before, that reads our datain
# Read a CSV from the data directory.
# NOTE: the original called setwd(), which (a) changed the session's working
# directory as a side effect and (b) returns the *previous* directory, so the
# file was resolved against the old working directory on the first call.
# Building the path directly avoids both problems. `dir` defaults to the
# original location, so existing calls are unchanged.
getfile <- function(file, dir = "~/CompareCare"){
        wantedFile <- file.path(dir, file)
        read.csv(wantedFile)
}
datain <- getfile("outcome-of-care-measures.csv")

INSPECT

class

Let’s look at the classes of all the columns in the dataset, first let’s see what type of object it is:

class(datain)
[1] "data.frame"
lapply

Let’s use lapply to find out what the class of each column is

class_list <- lapply(datain,class)
class_list   list[46]  List of length 46
$Provider.Number
[1] "character"
$Hospital.Name
[1] "character"
$Address.1
[1].....

PROCESS

List Groups

Let’s keep inspecting the data, we know we have to extract data by state eventually, so how many states/groups are in the dataframe?

There are so many ways to find that out let me see if I can hit on a few of them here, we can start with lapply, we just used it above so let’s start with it.

lapply

extract column

Here is the final statement, but let’s break it down piece by piece:

state_list <- sort(unique(unlist(lapply(datain,function(santa){datain[7]} ))))

I’ll create an anonymous function to extract the seventh column (State column). This will give us a list of 46 dfs one for each column with all the data grouped by datain[7] for all rows, that’s 4706 rows per column the total rows in the entire dataset.  I’ve included a part of the output:

  • lapply always puts out a list
  • each column is grouped according to the santa() the function based on column [7] the state column
  • As you can see, it starts off with AL; the reason being that AL happens to be the first state in the datain dataset.
  • I know there is a state AK which would be the first state after the data is sorted.
  • So be aware that the data is not sorted, just grouped

Here is how it breaks down:

howmanystates <- lapply(datain,function(santa){datain[7]})
str(howmanystates)
List of 46
 $ Provider.Number                                                                      :'data.frame':	4706 obs. of  1 variable:
  ..$ State: chr [1:4706] "AL" "AL" "AL" "AL" ...
 $ Hospital.Name                                                                        :'data.frame':	4706 obs. of  1 variable:
  ..$ State: chr [1:4706] "AL" "AL" "AL" "AL" ...
 $ Address.1                                                                            :'data.frame':	4706 obs. of  1 variable:
  ..$ State: chr [1:4706] "AL" "AL" "AL" "AL" ...
 $ Address.2                                                                            :'data.frame':	4706 obs. of  1 variable:
  ..$ State:
alter function

Instead of extracting State column[7], let’s look for unique values in column[7] using lapply again. Well the results are not close but close!

  • We end up with a df (46,54), that’s 46 columns for the original columns and 54 rows for all the unique State values column[7].
  • The funny part is we start with row numbered 1 with every value in that row being AL
  • Second row is numbered 99, with every value being AK for all 46 columns
  • Third row is numbered 116, with every value being AZ in every column
  • What’s interesting is that the total count of occurrences for AL is 98 as you’ll see down below when we count the occurrence of each state
  • The number for row 3 is 116 which is 17 higher than row 2 which corresponds to
  • The total number of occurrences for AK is 17
  • So in a way it does provide us with a count for occurrences for each state in a roundabout way
lapply_list <- lapply(datain, function(ho){unique(datain[7])})
unlist

What happens if we unlist howmanystates from above? Let’s try it. We get one long vector of 216,476 values. What happened is that unlist() flattens the previous list of dataframes into one long vector — we end up flattening a structure that had 46 columns of thousands of rows each into a single list of all the values.

howmanystates2 <- unlist(lapply(datain,function(santa){datain[7]}))
length(howmanystates2)
[1] 216476

That’s exactly what we want! Let’s now pull in the unique values for State!

unique

As mentioned above let’s see how many unique states are in the dataset. We get a list of characters 54 elements long.

howmanystates2 <- unique(unlist(lapply(datain,function(santa){datain[7]})))
 [1] "AL" "AK" "AZ" "AR" "CA" "CO" "CT" "DE" "DC" "FL" "GA" "HI" "ID" "IL" "IN" "IA" "KS" "KY" "LA" "ME"
 "MD" "MA" "MI" "MN" "MS" "MO" "MT"
[28] "NE" "NV" "NH" "NJ" "NM" "NY" "NC" "ND" "OH" "OK" "OR" "PA" "PR" "RI" "SC" "SD" "TN" "TX" "UT" "VT"
 "VI" "VA" "WA" "WV" "WI" "WY" "GU"
length(howmanystates2)
[1] 54
sort

As shown above, the list is not sorted in order so let’s do that with sort(). Now we have the complete statement –

I changed the name to state_list. We’ll use this later. SAME AS unique_list see below

state_list <- sort(unique(unlist(lapply(datain,function(santa){datain[7]} ))))
 [1] "AK" "AL" "AR" "AZ" "CA" "CO" "CT" "DC" "DE" "FL" "GA" "GU" "HI" "IA" "ID" "IL" "IN" "KS"
 "KY" "LA" "MA" "MD" "ME" "MI" "MN" "MO" "MS"
[28] "MT" "NC" "ND" "NE" "NH" "NJ" "NM" "NV" "NY" "OH" "OK" "OR" "PA" "PR" "RI" "SC" "SD" "TN"
 "TX" "UT" "VA" "VI" "VT" "WA" "WI" "WV" "WY"

split

split() will split the data by State. It will give us a list of 54 dataframes one for each state in the dataset. It is a few steps ahead of using lapply.

Each df is summarized with dimensions of rows and columns which gives us the length of each df, which is the count of rows in each group. The list is sorted in ascending order of State as well.

sgroup <- split(datain,datain$State)

sapply

Instead of using lapply above, we can use sapply, which counts each group of States and we avoid all the other steps. What if we want to pull the length of each out into its own list?

Now we get count_sgroup, which is a list of all the states with a count/nrow for each state. We used the split data sgroup from above.

count_sgroup <- sapply(sgroup, count)

unique

We could’ve just used unique without lapply() but we still have to sort it. Unique() starts extracting the first element in the State/ column[7] regardless of where in the order it happens to be. It is looking for a unique value in the column not meant to sort the result. So, with sort() we get unique_list,  which is the same as state_list of length 54.

unique_list <- sort(unique(datain$State))
SAME AS state_list see above in lapply/sort

n_distinct

Here is the definition of n_distinct in case you missed it from Process page. NA is irrelevant at this stage.

n_distinct() counts the number of unique/distinct combinations in a set of one or more vectors. It’s a faster and more concise equivalent to nrow(unique(data.frame(…))). Usage: n_distinct(…, na.rm = FALSE)

dis_list <- n_distinct(datain$State)
[1] 54

count

Use this one for many reasons. Will copy down to the section below because it provides answers to both sections. So what does this count() do?

  • It created a df with two columns
  • One column is State
  • Second count is (n) the count of occurrence for each state
  • sort=TRUE will sort the df in descending order of (n), i.e. largest counts first
  • sort=FALSE or omitted will sort the df in ascending order based on State
count_list <- datain |> 
        count(State)
   State   n
1     AK  17
2     AL  98
3     AR  77
4     AZ  77
5     CA 341
6     CO  72
7     CT  32
8     DC   8
9     DE   6
10    FL 180
11    GA 132
12    GU   1
13    HI  19
14    IA 109
15    ID  30
16    IL 179
17    IN 124
18    KS 118
19    KY  96
20    LA 114
21    MA  68
22    MD  45
23    ME  37
24    MI 134
25    MN 133
26    MO 108
27    MS  83
28    MT  54
29    NC 112
30    ND  36
31    NE  90
32    NH  26
33    NJ  65
34    NM  40
35    NV  28
36    NY 185
37    OH 170
38    OK 126
39    OR  59
40    PA 175
41    PR  51
42    RI  12
43    SC  63
44    SD  48
45    TN 116
46    TX 370
47    UT  42
48    VA  87
49    VI   2
50    VT  15
51    WA  88
52    WI 125
53    WV  54
54    WY  29
rename

Rename (n) to howmany by adding the last line

count_list <- datain |> 
         count(State) |> 
        rename( howmany = n)

group_by

Of course, group_by can be used. This chunk of code

  • Creates a df with the rows grouped by states with the first group being for AL because that happens to be the first row in the datain dataset.
  • Number of rows remains the same as the original datain set because all we’ve done is group
  •  A column labeled n displays the count of occurrences for each group/state, repeated for every row in that group till the next group starts
  • So for example AL will have 98 in that column and the next 97 rows will all have 98 till the next group start
  • Also another column that’s the sort column is created, in this case that column is labeled datain$State
  • Continue the next phase down in filter below
groupdata <- datain |>
        group_by(datain$State) |>
        mutate(found = n())

Count per Group

count per state

Let’s see how many entries per state. We could save the information if we need it

table(datain$State)
 AK  AL  AR  AZ  CA  CO  CT  DC  DE  FL  GA  GU  HI  IA  ID  IL  IN  KS  KY  LA  
 17  98  77  77 341  72  32   8   6 180 132   1  19 109  30 179 124 118  96 114 
 MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV 
 68 45 37 134 133 108 83 54 112 36 90 26 65 40 28
 NY  OH  OK  OR  PA  PR  RI  SC  SD  TN  TX  UT  VA  VI  VT  WA  WI  WV  WY 
185 170 126  59 175  51  12  63  48 116 370  42  87   2  15  88 125  54  29 

count & list of groups

This is a very handy way of creating a list as well as the count of each group/state saved in a df. I prefer to use this if I needed a separate list. See List Groups duplicate section explaining this block of code as well as the output for it

count_list <- datain |> 
        count(State)
Observations
  1. 54 states have data
  2. Not all states have the same count which means that some states will not have values depending on what the user inputs in num=XX. For example if the user enters num=25, then right off the bat we know AK, DC, DE, GU, HI, RI, VI, VT will not have any entries at that ranking and the other 48 MIGHT, I say might
  3. Just because a state has a specific count it doesn’t mean that every column will have a value up to the nrow, so we have to know how to handle those missing values
  4.  All mortality rate columns are of “character” class and we need to output a value related to the rank of each hospital, so we need to coerce these columns to numeric
  5. We have 46 columns and we only need a handful of columns for this function

CLEAN

Arguments

Since we are going to be writing a function, we want to guard against bad input from the user (wrong values as arguments)

  • The function rankall() we are about to create takes 2 arguments
  • outcome – which could be any of these three: heart attack, heart failure, pneumonia
  • num – takes an integer, or “best” or “worst” with best meaning the lowest rating and worst meaning the highest rating
  • If outcome is wrong we want to issue a warning and stop the execution
  • Look in the documentation for more information as to which column corresponds to each outcome.
  • As listed above, the corresponding columns are: 11, 17, 23
  • We already know the State column is column 7
  • Hospital Name column is column 2 or Hospital.Name

Here is how we’ll deal with validating the arguments:

# check validity of (outcome input)
        if (!((outcome == "heart attack") |
               (outcome == "heart failure") |
               (outcome == "pneumonia"))) {stop("invalid outcome")}
# Assign outcome to appropriate column so we can filter that out
        if (outcome == "heart attack") {usethis = 11}
        if (outcome == "heart failure") {usethis = 17}
        if (outcome == "pneumonia") {usethis = 23}

Filter

Let’s filter out the columns we need based on the argument. Based on what I created above we can now use (usethis) as the filter

select

Let’s narrow down the dataset to the needed columns

# Let's filter out columns not needed
        subdata <- select(datain, c("State", "Hospital.Name", all_of(usethis)))

remove na

Many rows have the literal string “Not Available” as a value, and the NA functions fail to remove them (the string is not a real NA), so I’ll just remove those rows manually

gooddata <-  subset(subdata, subdata[3] != "Not Available")

coerce values

Remember that all the rate columns are of type “char”, so instead of coercing the entire dataset, I chose to just work with the column that’s needed based on the input to the function. Since we already created a subset of the 3 columns we extracted and named it gooddata we’ll use that and coerce column 3

gooddata[3] <- as.numeric(unlist(gooddata[3]))

group_by

See group_by section above.

rank

arrange

So far, we’ve

  • Loaded the data
  • Inspected the data
  • Validate arguments coming into the function
  • Removed “Not Available” from rows
  • Filtered the data to the “outcome” specified by the arguments
  • Coerced the outcome column from char to numeric
  • So now let’s group the data into states and sort in order
  • Within each state we want sort mortality rate “outcome” in ascending order
  • We want to create a ranking for each row within that group/state
  • Basically, we want to know the rank of each hospital within its state
  • If two hospitals are tied with the same rate, we need to choose the one that occurs first alphabetically

Let’s group the data by state, count the occurrences of observations in each state

groupdata <- gooddata |>
                group_by(gooddata$State) |>
                mutate(found = n())

Arrange within each group/state the rate (column 2) and if a tie exists, we need to choose the one that occurs first alphabetically

        rankedgroups <- groupdata |>
                arrange((groupdata[3]), groupdata[2], .by_group = TRUE) |>
                mutate(rank = row_number())

Here is what I did in the code above:

  • arranged first by [3] which is the rate, so now it’ll all be sorted in ascending order with the best(lowest value) on top
  • then we arrange it by [2] which is Hospital.Name so in case of a tie the one that occurs first alphabetically will get the better (lower-numbered) rank
  • create a new column where we assign the rank of each hospital within that state

Conditions

If you remember above, I tested for the arguments but not all of them. I didn’t test for “best”, “worst” and how to extract that data. Let’s look at the code first and then I’ll explain

        if (num == "best") {num = 1}
        if (num == "worst") {num = rankedgroups$found}
        
        #filter rankedgroups to show rows with rank = input
        rankedout <- subset(rankedgroups, rankedgroups$rank == num)

If you remember from before:

  • I had created a column labeled found in the first group_by function. That value is the most rows that State had for that outcome, which would make it the max, in other words it’s the “worst” rank.
  • So if the user inputs “worst” we assign num = rankedgroups$found
  • We create a df with only the ranked value num so we use subset() with the condition rankedgroups$rank == num

extract group by rank

So now let’s create that df I mentioned a line up. We want to take the num argument and filter out the data to only show the hospitals that meet that rank. For example, if we want the top 5 hospitals for that outcome in each state we set num=5

rankedout <- subset(rankedgroups, rankedgroups$rank == num)

group list

Remember earlier we used several ways to extract a list of the groups/states and the count of occurrences/observations for each? Here we use it again

count_list <- datain |> 
                count(State) |> 
                rename( found = n)

merge lists

Now that we have a list of all the states (count_list) and a list of all the filtered, ranked Hospital Names for each State (rankedout) we can merge them together

  • Note: I’ll use count_list on the left because that’s the list of ALL states in the dataset
  • One of the conditions was to display any state that has missing values as NA
  • We don’t need all the columns from (rankedout) df so we’ll just select the two we want
 almostlist <- merge(count_list, rankedout[,c("State", "Hospital.Name")], by= "State", all.x = TRUE)

drop column

count_list had a column (found) that is no longer needed so we can drop it

almostlist <- subset(almostlist, select= -c(found))

ENTIRE FUNCTION

Rankall

I’ve removed all the comments since we covered everything in detail up above.

# select() is in tidyverse so load it
library(tidyverse)
library(dplyr)
#_______________Let's copy the function from before, that reads our datain
#_______________Let's copy the function from before, that reads our datain
# Read a CSV from the data directory.
# NOTE: the original called setwd(), which changes the session working
# directory as a side effect and returns the *previous* directory, so the
# file was resolved against the old working directory on the first call.
# `dir` defaults to the original location, so existing calls are unchanged.
getfile <- function(file, dir = "~/CompareCare"){
        wantedFile <- file.path(dir, file)
        read.csv(wantedFile)
}
datain <- getfile("outcome-of-care-measures.csv")

# Return, for each state, the hospital at rank `num` for the given `outcome`.
#
# Arguments:
#   outcome - one of "heart attack", "heart failure", "pneumonia"
#   num     - an integer rank, or "best" (rank 1) or "worst" (last rank
#             within each state); defaults to "best"
# Returns a data frame with one row per state (State, Hospital.Name),
# with NA for states that have no hospital at the requested rank.
# Relies on `datain` (loaded above) and the tidyverse being attached.
rankall <- function(outcome, num = "best") {
        # Map each valid outcome to its mortality-rate column in datain;
        # reject anything else before touching the data.
        outcome_col <- c("heart attack" = 11,
                         "heart failure" = 17,
                         "pneumonia" = 23)
        if (!outcome %in% names(outcome_col)) stop("invalid outcome")
        usethis <- outcome_col[[outcome]]

        # Keep only the columns we need: State, hospital name, and the rate.
        subdata <- select(datain, c("State", "Hospital.Name", all_of(usethis)))

        # Drop rows carrying the literal string "Not Available" (not a real
        # NA), then coerce the rate column from character to numeric.
        gooddata <- subset(subdata, subdata[[3]] != "Not Available")
        gooddata[[3]] <- as.numeric(gooddata[[3]])

        # Within each state: count the hospitals (`found` is also the worst
        # possible rank), sort by rate then by name (alphabetical order
        # breaks ties), and number the rows to get each hospital's in-state
        # rank. Using the column names inside the pipe (via the data mask)
        # instead of `groupdata[3]` keeps the per-group ordering explicit,
        # and group_by(State) avoids the spurious `gooddata$State` column
        # the original created by grouping on an external vector.
        rate_col <- names(gooddata)[3]
        rankedgroups <- gooddata |>
                group_by(State) |>
                mutate(found = n()) |>
                arrange(.data[[rate_col]], Hospital.Name, .by_group = TRUE) |>
                mutate(rank = row_number()) |>
                ungroup()

        # "best" is rank 1; "worst" is the last rank in each state (`found`).
        # identical() avoids fragile number-vs-string coercion with ==.
        if (identical(num, "best")) num <- 1
        if (identical(num, "worst")) num <- rankedgroups$found

        # Keep only the rows at the requested rank.
        rankedout <- subset(rankedgroups, rank == num)

        # Left-join against the full state list so states with no hospital
        # at the requested rank still appear, with NA for the hospital name.
        count_list <- count(datain, State)
        almostlist <- merge(count_list,
                            rankedout[, c("State", "Hospital.Name")],
                            by = "State", all.x = TRUE)

        # Drop the helper count column and return the result explicitly
        # (the original ended on an assignment, whose value is returned
        # invisibly).
        subset(almostlist, select = -n)
}
c <- rankall("pneumonia",20)

SUBSET MULTI COND

Data

Load the data from here https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Fss06hid.csv and load the data into R. The code book is at https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FPUMSDataDict06.pdf

Business Case

  • Create a logical vector that identifies the households on greater than 10 acres who sold more than $10,000 worth of agriculture products
  • Assign that logical vector to the variable agricultureLogical
  • Apply which() to identify the rows of the data frame where the logical vector is TRUE.
  • Provide the first 3 row IDs that meet the conditions
observations

After reviewing the code book, the columns that I need are as follows:

  • AGS = 6 is the column for sales of agriculture products >10000
  • ACR = 3 is for house on 10 acres or more
  • I’ll use subset() to filter out the conditions into a new df
  • I’ll use which() as well to confirm the answers

load data

I’ll use the function I created earlier to load the data after downloading to my local drive.

which

# Read a CSV from the coursera data directory.
# NOTE: the original called setwd(), which changes the session working
# directory as a side effect and returns the *previous* directory, so the
# file was resolved against the old working directory on the first call.
# `dir` defaults to the original location, so existing calls are unchanged.
getc3w3 <- function(file, dir = "D:/Education/R/Data/coursera"){
        wantedFile <- file.path(dir, file)
        read.csv(wantedFile)
}

c3_w3 <- getc3w3("course3_week3.csv")
agri <- subset(c3_w3,AGS==6 & ACR==3)
head(agri, n=3)
125, 238, 262

#_______ I'll use which() to confirm results
blah <- c3_w3[which(c3_w3$AGS==6 & c3_w3$ACR==3),]
head(blah, n=3)
125, 238,262

JPEG

Business Case

  • Install jpeg package
  • Download a specified jpeg to the local drive
  • Calculate the 30 and 80%

jpeg

quantile

install.packages("jpeg")
library(jpeg)
# Read a JPEG (as a native raster) from the coursera data directory.
# NOTE: the original called setwd(), which changes the session working
# directory as a side effect and returns the *previous* directory, so the
# file was resolved against the old working directory on the first call.
# `dir` defaults to the original location, so existing calls are unchanged.
# Requires the jpeg package (readJPEG) to be attached.
getimage <- function(file, dir = "D:/Education/R/Data/coursera"){
        wantedFile <- file.path(dir, file)
        readJPEG(wantedFile, native = TRUE)
}
image <- getimage("getdata_jeff.jpg")
quantile(image,c(0.3,0.8))
    30%       80% 
-15259150 -10575416

INTERSECT

Business Case

intersect

merge

    # Read a CSV from the coursera data directory.
    # NOTE: the original called setwd(), which changes the session working
    # directory as a side effect and returns the *previous* directory, so the
    # file was resolved against the old working directory on the first call.
    # `dir` defaults to the original location, so existing calls are unchanged.
    getfile <- function(file, dir = "D:/Education/R/Data/coursera"){
            wantedFile <- file.path(dir, file)
            read.csv(wantedFile)
    }
gdp <- getfile("getdata_data_GDP.csv")
edu <- getfile("getdata_data_EDSTATS_Country.csv")

# compare country short codes
short_intersect <- intersect(gdp$X.2, edu$Short.Name)
View(short_intersect)

library(dplyr)
merged <- merge(gdp, edu, by.x = "X", by.y = "CountryCode")
names(merged)    #to find the names I need

merged <- merged |>
        select("X","Gross.domestic.product.2012","Long.Name","Short.Name", "Income.Group" )
merged <- merged |> 
        arrange(desc(as.numeric(Gross.domestic.product.2012)))


#  Eliminate rows that don't have a value for GDP
merged <- merged |> 
        filter(merged$Gross.domestic.product.2012>= 1)
189 rows = matches, 13th country is St Kitts and Nevis

SUMMARISE

Business Case

  • What’s the average GDP ranking for the High Income: OECD and High Income: nonOCED groups?

summarise

str(merged)    #just to refresh my memory as to the column type
avgtable <- merged |>  
        group_by(Income.Group) |> 
        summarise(av = mean(as.numeric(Gross.domestic.product.2012)))    
        
# A tibble: 5 × 2
Income.Group             av
                
1 High income: OECD     33.0
2 High income: nonOECD  91.9
3 Low income           134. 
4 Lower middle income  108. 
5 Upper middle income   92.1

CUT

Business Case

  • Cut the GDP rankings into 5 separate groups
  • Make a table for rankings vs Income.Group
  • How many countries are Lower Middle Income but are among the 38 nations with the Highest GDP
  • Note the Highest GDP is the US = 1 rank, so I need to look at 1-38
  • Install Hmisc package

cut

breaks

quantile

table

install.packages("Hmisc")
library(Hmisc)

GDP <- as.numeric(merged$Gross.domestic.product.2012)

#_____ The next line is just to check the distribution ONLY not part of the solution
cutGDP =quantile(as.numeric(merged$Gross.domestic.product.2012), na.rm = TRUE)
 0% 25% 50% 75% 100%
  1 48   95 143 190

#_________The next block is not the answer either, I am just showing the difference between using quantile and 5 breaks
cGDP <- cut( merged, breaks = quantile(GDP))

cutup <- cut(as.numeric(merged$Gross.domestic.product.2012), breaks=quantile(as.numeric(merged$Gross.domestic.product.2012)))
[1] (143,190] (143,190] (143,190] (143,190] (143,190] (143,190] (143,190] (143,190] (143,190] (143,190] (143,190] (143,190]
[13] (143,190] (143,190] (143,190] (143,190] (143,190] (143,190] (143,190] (143,190] (143,190] (143,190] (143,190] (143,190]
[25] (143,190] (143,190] (143,190] (143,190] (143,190] (143,190] (143,190] (143,190] (143,190] (143,190] (143,190] (143,190]
[37] (143,190] (143,190] (143,190] (143,190] (143,190] (143,190] (143,190] (143,190] (143,190] (143,190] (143,190] (95,143] 
[49] (95,143]  (95,143]  (95,143]  (95,143]  (95,143]  (95,143]  (95,143]  (95,143]  (95,143]  (95,143]  (95,143]  (95,143] 
[61] (95,143]  (95,143]  (95,143]  (95,143]  (95,143]  (95,143]  (95,143]  (95,143]  (95,143]  (95,143]  (95,143]  (95,143] 
[73] (95,143]  (95,143]  (95,143]  (95,143]  (95,143]  (95,143]  (95,143]  (95,143]  (95,143]  (95,143]  (95,143]  (95,143] 
[85] (95,143]  (95,143]  (95,143]  (95,143]  (95,143]  (95,143]  (95,143]  (95,143]  (95,143]  (95,143]  (48,95]   (48,95]  
[97] (48,95]   (48,95]   (48,95]   (48,95]   (48,95]   (48,95]   (48,95]   (48,95]   (48,95]   (48,95]   (48,95]   (48,95]  
[109] (48,95]   (48,95]   (48,95]   (48,95]   (48,95]   (48,95]   (48,95]   (48,95]   (48,95]   (48,95]   (48,95]   (48,95]  
[121] (48,95]   (48,95]   (48,95]   (48,95]   (48,95]   (48,95]   (48,95]   (48,95]   (48,95]   (48,95]   (48,95]   (48,95]  
[133] (48,95]   (48,95]   (48,95]   (48,95]   (48,95]   (48,95]   (48,95]   (48,95]   (48,95]   (1,48]    (1,48]    (1,48]   
[145] (1,48]    (1,48]    (1,48]    (1,48]    (1,48]    (1,48]    (1,48]    (1,48]    (1,48]    (1,48]    (1,48]    (1,48]   
[157] (1,48]    (1,48]    (1,48]    (1,48]    (1,48]    (1,48]    (1,48]    (1,48]    (1,48]    (1,48]    (1,48]    (1,48]   
[169] (1,48]    (1,48]    (1,48]    (1,48]    (1,48]    (1,48]    (1,48]    (1,48]    (1,48]    (1,48]    (1,48]    (1,48]   
[181] (1,48]    (1,48]    (1,48]    (1,48]    (1,48]    (1,48]    (1,48]    (1,48]         
        Levels: (1,48] (48,95] (95,143] (143,190]

#_____ The BC is for 5 levels so let's cut it with breaks=5
cutup2 <- cut(as.numeric(merged$Gross.domestic.product.2012), breaks=5)
[1] (152,190]    (152,190]    (152,190]    (152,190]    (152,190]    (152,190]    (152,190]    (152,190]    (152,190]   
[10] (152,190]    (152,190]    (152,190]    (152,190]    (152,190]    (152,190]    (152,190]    (152,190]    (152,190]   
[19] (152,190]    (152,190]    (152,190]    (152,190]    (152,190]    (152,190]    (152,190]    (152,190]    (152,190]   
[28] (152,190]    (152,190]    (152,190]    (152,190]    (152,190]    (152,190]    (152,190]    (152,190]    (152,190]   
[37] (152,190]    (152,190]    (114,152]    (114,152]    (114,152]    (114,152]    (114,152]    (114,152]    (114,152]   
[46] (114,152]    (114,152]    (114,152]    (114,152]    (114,152]    (114,152]    (114,152]    (114,152]    (114,152]   
[55] (114,152]    (114,152]    (114,152]    (114,152]    (114,152]    (114,152]    (114,152]    (114,152]    (114,152]   
[64] (114,152]    (114,152]    (114,152]    (114,152]    (114,152]    (114,152]    (114,152]    (114,152]    (114,152]   
[73] (114,152]    (114,152]    (114,152]    (76.6,114]   (76.6,114]   (76.6,114]   (76.6,114]   (76.6,114]   (76.6,114]  
[82] (76.6,114]   (76.6,114]   (76.6,114]   (76.6,114]   (76.6,114]   (76.6,114]   (76.6,114]   (76.6,114]   (76.6,114]  
[91] (76.6,114]   (76.6,114]   (76.6,114]   (76.6,114]   (76.6,114]   (76.6,114]   (76.6,114]   (76.6,114]   (76.6,114]  
[100] (76.6,114]   (76.6,114]   (76.6,114]   (76.6,114]   (76.6,114]   (76.6,114]   (76.6,114]   (76.6,114]   (76.6,114]  
[109] (76.6,114]   (76.6,114]   (76.6,114]   (76.6,114]   (76.6,114]   (38.8,76.6]  (38.8,76.6]  (38.8,76.6]  (38.8,76.6] 
[118] (38.8,76.6]  (38.8,76.6]  (38.8,76.6]  (38.8,76.6]  (38.8,76.6]  (38.8,76.6]  (38.8,76.6]  (38.8,76.6]  (38.8,76.6] 
[127] (38.8,76.6]  (38.8,76.6]  (38.8,76.6]  (38.8,76.6]  (38.8,76.6]  (38.8,76.6]  (38.8,76.6]  (38.8,76.6]  (38.8,76.6] 
[136] (38.8,76.6]  (38.8,76.6]  (38.8,76.6]  (38.8,76.6]  (38.8,76.6]  (38.8,76.6]  (38.8,76.6]  (38.8,76.6]  (38.8,76.6] 
[145] (38.8,76.6]  (38.8,76.6]  (38.8,76.6]  (38.8,76.6]  (38.8,76.6]  (38.8,76.6]  (38.8,76.6]  (0.811,38.8] (0.811,38.8]
[154] (0.811,38.8] (0.811,38.8] (0.811,38.8] (0.811,38.8] (0.811,38.8] (0.811,38.8] (0.811,38.8] (0.811,38.8] (0.811,38.8]
[163] (0.811,38.8] (0.811,38.8] (0.811,38.8] (0.811,38.8] (0.811,38.8] (0.811,38.8] (0.811,38.8] (0.811,38.8] (0.811,38.8]
[172] (0.811,38.8] (0.811,38.8] (0.811,38.8] (0.811,38.8] (0.811,38.8] (0.811,38.8] (0.811,38.8] (0.811,38.8] (0.811,38.8]
[181] (0.811,38.8] (0.811,38.8] (0.811,38.8] (0.811,38.8] (0.811,38.8] (0.811,38.8] (0.811,38.8] (0.811,38.8] (0.811,38.8]
Levels: (0.811,38.8] (38.8,76.6] (76.6,114] (114,152] (152,190]

ANSWER <- table(cutup2,merged$Income.Group )

cutup2         High income: nonOECD High income: OECD Low income Lower middle income Upper middle income
(0.811,38.8]                    4                18          0                   5                  11
(38.8,76.6]                     5                10          1                  13                   9
(76.6,114]                      8                 1          9                  12                   8
(114,152]                       4                 1         16                   8                   8
(152,190]                       2                 0         11                  16                   9

#__________________  ANSWER IS LOWER MIDDLE INCOME AND FIRST ROW == 5 FROM TABLE ABOVE
ANSWER[1,4]
[1] 5

CROSS STREETS

See Vector post for more.

grep

Let’s say

grep("Alameda", df$intersection)
[1]  4  5  36
grep("Alameda", df$intersection, value=TRUE)
[1] "The Alameda & 33rd st"  "E 33rd & The Alameda"   "Harford...."
grep("Jeffers", df$intersection)
integer(0)
length(grep("Jeffers", df$intersection))
[1] 0

grepl

Will look for this version Alameda. It will look for it in this intersection variable. And it will return a vector that’s true whenever Alameda appears and false whenever Alameda doesn’t appear.

And so in this case you can see that in that three of the times the Alameda appears, and so if I make a table of the true and false values, it’s true three of the times.

table(grepl("Alameda", df$intersection)
FALSE   TRUE
  77       3

For example, suppose I want to find all the cases where Alameda appears in the intersection. And if Alameda doesn’t appear, then I want to subset to only that subset of the data.

I can do that using this grepl command. So you can use that to subset your data based on certain searches that you want to be able to find.

subset
subgroup <- df[!grepl("Alameda", df$intersection),]

C3W4 QUIZ- 5QUESTIONS

STRSPLIT

Data

The American Community Survey distributes downloadable data about United States communities. Download the 2006 microdata survey about housing for the state of Idaho using download.file() from here:
https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Fss06hid.csv

and load the data into R. The code book, describing the variable names is here:  https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FPUMSDataDict06.pdf

Business Case

  • Apply strsplit() to split all the names of the data frame on the characters “wgtp”.
  •  What is the value of the 123 element of the resulting list?
        # Read a CSV from the coursera data directory.
        # NOTE: the original called setwd(), which changes the session working
        # directory as a side effect and returns the *previous* directory, so
        # the file was resolved against the old working directory on the first
        # call. `dir` defaults to the original location, so existing calls are
        # unchanged.
        getc3w4 <- function(file, dir = "D:/Education/R/Data/coursera"){
                wantedFile <- file.path(dir, file)
                read.csv(wantedFile)
        }

c3_w4 <- getc3w4("getdata_data_ss06hid.csv")
wgtpsplit <- strsplit(names(c3_w4),"wgtp")
List of 188
$ : chr "RT"
$ : chr "SERIALNO"
$ : chr "DIVISION"
$ : chr "PUMA"
$ : chr "REGION"
$ : chr "ST"
$ : chr "ADJUST"
$ : chr "WGTP"
$ : chr "NP"
$ : chr "TYPE"
$ : chr "ACR"
$ : chr "AGS"
$ : chr "BDS"
$ : chr "BLD"
$ : chr "BUS"
wgtpsplit[123]
[[1]]
[1] ""   "15"

GSUB

Data

Load the Gross Domestic Product data for the 190 ranked countries in this data set:  https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FGDP.csv
Original data sources:
http://data.worldbank.org/data-catalog/GDP-ranking-table

Business Case

  • Remove the commas from the GDP numbers in millions of dollars and
  • Average them. What is the average?
library(dplyr)       
library(tidyr)        
        # Read a CSV from the coursera data directory.
        # NOTE: the original called setwd(), which changes the session working
        # directory as a side effect and returns the *previous* directory, so
        # the file was resolved against the old working directory on the first
        # call. `dir` defaults to the original location, so existing calls are
        # unchanged.
        getgdp <- function(file, dir = "D:/Education/R/Data/coursera"){
                wantedFile <- file.path(dir, file)
                read.csv(wantedFile)
        }
gdpp <- getgdp("getdata_data_GDP.csv")
gdpcol <- gdpp |> select(X.1, X.2,X.3)
#it appears that the real data starts at row 5 and stops at row 194
realdf <- gdpcol[5:194,]

# column X.3 is the one we need and is of type "char". Substitute "," with ""
gdp11 <- gsub(",","",realdf$X.3)
df11 <- data.frame(gdp11)

mean(as.numeric(df11$gdp11))
[1] 377652.4
#____or
summarise(df11, mean=mean(as.numeric(gdp11)))
mean
1 377652.4

fread

#Could've used fread to import data directly into a table with
# skip=5 drops the junk header rows, nrows=190 keeps only the ranked
# countries, and select/col.names pick and rename the useful columns
# in one step -- replacing the select()/subset()/gsub() dance above.
GDPrank <- data.table::fread('http://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FGDP.csv'
, skip=5
, nrows=190
, select = c(1, 2, 4, 5)
, col.names=c("CountryCode", "Rank", "Country", "GDP")
)

grep

Same data as above. Count the number of countries whose names begin with "United".

# FIX: the original line `grep("^United",countryNames), 3` is not valid R
# (the trailing ", 3" was the quiz answer pasted into the expression).
# To COUNT the matches, wrap grep() -- which returns the matching
# indices -- in length():
length(grep("^United", countryNames))  # [1] 3

MATCH EXPRESSION

Data

Load the Gross Domestic Product data for the 190 ranked countries in this data set:  https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FGDP.csv
Load the educational data from this data set: https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FEDSTATS_Country.csv from http://data.worldbank.org/data-catalog/ed-stats
Original data sources: http://data.worldbank.org/data-catalog/GDP-ranking-table

Business Case

  • Match/merge the data based on the country shortcodes
  • For the countries that have reported an “end of fiscal year”
  • That data could be extracted from column X
  • How many countries’ fiscal year ends in June?

merge

arrange

grep

# Read a CSV file from the fixed Coursera data directory.
#
# BUG FIX: the original did `fileDir <- setwd("D:/...")`. setwd() returns
# the *previous* working directory (invisibly), so fileDir pointed at the
# old wd and the session wd was changed as a side effect. Use the
# directory path directly -- same result, no hidden state change.
#
# @param file  File name (relative to the data directory).
# @return      A data.frame read with read.csv().
getfile <- function(file){
        fileDir <- "D:/Education/R/Data/coursera"
        wantedFile <- file.path(fileDir, file)
        read.csv(wantedFile)
}
# Read both data sets with the shared helper.
gdp <- getfile("getdata_data_GDP.csv")
edu <- getfile("getdata_data_EDSTATS_Country.csv")
library(dplyr)
# Merge on the country short code: column "X" in the GDP file lines up
# with "CountryCode" in the education file.
merged <- merge(gdp, edu, by.x = "X", by.y = "CountryCode")
names(merged)
# Keep only the columns needed for the question.
merged <- merged |>
        select("X","Gross.domestic.product.2012","Long.Name","Short.Name", "Special.Notes" )
# Sort descending so the "Fiscal year end: ..." notes cluster together.
merged <- merged |> 
        arrange(desc(Special.Notes))
#Now we look for how many rows match "end: June"
grep("end: June",merged$Special.Notes)
[1] 48 49 50 51 52 53 54 55 56 57 58 59 60
#let's get the values -- value = TRUE returns the matching strings
# themselves instead of their row indices
grep("end: June",merged$Special.Notes, value = TRUE)
[1] "Fiscal year end: June 30; reporting period for national accounts data: FY."
[2] "Fiscal year end: June 30; reporting period for national accounts data: FY."
[3] "Fiscal year end: June 30; reporting period for national accounts data: FY."
[4] "Fiscal year end: June 30; reporting period for national accounts data: FY."
[5] "Fiscal year end: June 30; reporting period for national accounts data: FY."
[6] "Fiscal year end: June 30; reporting period for national accounts data: FY."
[7] "Fiscal year end: June 30; reporting period for national accounts data: FY."
[8] "Fiscal year end: June 30; reporting period for national accounts data: CY."
[9] "Fiscal year end: June 30; reporting period for national accounts data: CY."
[10] "Fiscal year end: June 30; reporting period for national accounts data: CY."
[11] "Fiscal year end: June 30; reporting period for national accounts data: CY."
[12] "Fiscal year end: June 30; reporting period for national accounts data: CY."
[13] "Fiscal year end: June 30; reporting period for national accounts data: CY."

LUBRIDATE

Data

You can use the quantmod (http://www.quantmod.com/) package to get historical stock prices for publicly traded companies on the NASDAQ and NYSE. Use the following code to download data on Amazon’s stock price and get the times the data was sampled.

# One-time setup: quantmod fetches market data, lubridate parses dates.
install.packages("quantmod")
library(quantmod)
library(lubridate)
# auto.assign=FALSE makes getSymbols() RETURN the xts object instead of
# assigning it into the global environment under the name "AMZN".
amzn = getSymbols("AMZN",auto.assign=FALSE)
# index() of an xts object is its time index -- here a Date vector of
# every trading day in the sample.
sampleTimes = index(amzn)
str(sampleTimes)
Date[1:4387], format: "2007-01-03" "2007-01-04" "2007-01-05" "2007-01-08" "2007-01-09" 

# Add a year column (lubridate::year) and a weekday-name column
# (base weekdays). mutate() requires dplyr to be loaded.
timedf <- data.frame(sampleTimes)
timedf <- timedf |>  mutate(year = year(sampleTimes)) |> mutate(day = weekdays(sampleTimes))

Business Case

  • How many values were collected in 2012?
  • How many values were collected on Mondays in 2012?
# which() returns the row indices where year == 2012; the count of
# trading days sampled in 2012 is just its length.
answerdf = which(timedf$year == 2012 )
length(answerdf)
[1] 250
#____ OR 2012 -- same count via subset() + nrow()
yeardf <- subset(timedf, timedf$year == 2012)
nrow(yeardf)
[1] 250
#_____ 2012 & MONDAYS ONLY (& is the vectorized AND, needed for subset)
ans <- subset(timedf, timedf$year == 2012 & timedf$day == "Monday" )
nrow(ans)
[1] 47