###############################
# Example of salary data
# Input the data into a data frame 'sal'
sal = read.table("salary.dat", header=T)

# Look at the first six rows of data
head(sal)

# 'sal' is a data frame; we can look at the structure of this data frame
str(sal)

# There are 11 columns in this data frame; we can pick an individual column
sal$gender          # gender is a vector of F & M values
head(sal$gender)    # this gives the first 6 values; parentheses are for functions
sal$gender[10:15]   # this lists the 10th-15th values; square brackets are for vectors
sal[2, 3]           # the value at the 2nd row, 3rd column
sal[10:15, 2:4]     # values from rows 10-15 and columns 2-4

# How many females and males are in this data set?
table(sal$gender)

hist(sal$salary)    # what is a histogram?

# Question: how long did it take to get this job after the degree?
interval = sal$startyr - sal$yrdeg
hist(interval)
# Apparently, some people started working before they got their degrees.

#######################################
### Question: Is there a gender bias in salary?
#######################################
# First, we can look at a plot of salary by gender
boxplot(sal$salary ~ sal$gender)

# Partition the salaries by gender
fs = sal$salary[sal$gender == 'F']   # females' salaries
ms = sal$salary[sal$gender == 'M']   # males' salaries
mean(fs)
mean(ms)

# Try a t-test
t.test(ms, fs)
t.test(log(ms), log(fs))   # t-test on log-transformed data

# Use linear regression to examine the association
m = lm(sal$salary ~ sal$gender)
# Look at the summary report of the regression results
summary(m)

# Or, do this in a different way
s = sal$salary
g = sal$gender
m = lm(s ~ g)
summary(m)   # R^2 = 0.01804

# What is R^2?
ss_tot = sum((s - mean(s))^2)
fitted_values = predict(m)
ss_reg = sum((fitted_values - mean(fitted_values))^2)
ss_err = sum((s - fitted_values)^2)
ss_reg / ss_tot   # this should be the same as R^2

# Work on your own: examine the effect of rank on salary
# ... ...
# ... ...
# So, rank is much better associated with salary than gender.

# Well, what is going on with salary, rank, and gender?
table(sal$gender, sal$rank)        # what does this mean?
tb = table(sal$gender, sal$rank)   # the gender-by-rank table of observed counts

# chisq.test to see whether gender -> rank
chisq.test(tb)   # what does this mean? Pay attention to the p-value.

# Let's calculate the chi-square statistic manually
fp  = sum(tb[1,]) / sum(tb)   # female fraction in the sample
mp  = sum(tb[2,]) / sum(tb)   # male fraction in the sample
atp = sum(tb[,1]) / sum(tb)   # assistant professor fraction
asp = sum(tb[,2]) / sum(tb)   # associate professor fraction
flp = sum(tb[,3]) / sum(tb)   # full professor fraction
col1 = sum(tb) * atp * c(fp, mp)   # expectation, assistant prof
col2 = sum(tb) * asp * c(fp, mp)   # expectation, associate prof
col3 = sum(tb) * flp * c(fp, mp)   # expectation, full prof
E = matrix(c(col1, col2, col3), nrow=2)   # this is the expectation
# In later sessions, you will learn tricks to do the above calculation
# more efficiently using 'apply' (see the sketch at the end of this script).
O = tb                    # rename 'tb' as the observations
c = sum((O - E)^2 / E)    # this is the chi-square value
1 - pchisq(c, df=2)       # this is the p-value; df = (2-1)*(3-1) = 2
# This should be the same as chisq.test.

# Q: what is the difference between the t-test and the chi-square test?
## ... ...

# Another example of a chi-square test
x = matrix(c(890, 310, 900, 300), nrow=2)
chisq.test(x)

# fisher.test()
fisher.test(tb)   # runs out of memory

# What is/are our new hypothesis or hypotheses?
# H0: salary is independent of rank and gender
# H1a: gender -> salary <- rank
# H1b: gender -> rank -> salary

# Multiple regression
summary(lm(sal$salary ~ sal$gender + sal$rank))
summary(lm(sal$rank ~ sal$gender))

# A more sophisticated way is to compare the likelihood of each model.
# We will discuss likelihood methods in later parts of this course.
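
###############################
# A possible 'apply'-style sketch of the expected-count calculation promised
# above, assuming 'tb' is the 2 x 3 gender-by-rank table built earlier.
# outer() multiplies every row total by every column total; dividing by the
# grand total gives the expected count of each cell under independence.
row_totals = apply(tb, 1, sum)            # same as sum(tb[1,]) and sum(tb[2,])
col_totals = apply(tb, 2, sum)            # totals for each rank
E2 = outer(row_totals, col_totals) / sum(tb)
E2                                        # should match the matrix E above
sum((tb - E2)^2 / E2)                     # same chi-square value as before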
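
###############################
# A minimal sketch of one way to compare the two salary models above by
# likelihood, assuming the 'sal' data frame is still loaded. logLik(), AIC(),
# and anova() are base-R tools; this is only an illustration, not the full
# likelihood machinery covered later in the course.
m_gender = lm(salary ~ gender, data = sal)          # gender only
m_both   = lm(salary ~ gender + rank, data = sal)   # gender + rank
logLik(m_gender)          # log-likelihood of each fitted model
logLik(m_both)
AIC(m_gender, m_both)     # lower AIC suggests a better fit/complexity trade-off
anova(m_gender, m_both)   # F-test for the nested comparison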