The raw data file “activity.csv” should be included in the GitHub repo. This .csv file is extracted from the activity.zip file. The file can also be downloaded from: https://www.coursera.org/learn/reproducible-research/peer/gYyPt/course-project-1
Either way, the working directory should be set to the directory that contains “activity.csv”.
After setting your working directory:
#setwd('YOUR DIRECTORY PATH HERE')
Then you can load activity data using “read.csv”
# Read activity.csv from the current working directory
RawActivityData <- read.csv(file.path(getwd(), "activity.csv"))
The following displays a small subset of the data to show how the “interval” field relates to 5-minute intervals in a 24-hour day:
# Rows 861 to 868 span the change of date, where interval wraps from 2355 to 0
RawActivityData[861:868, ]
## steps date interval
## 861 0 2012-10-03 2340
## 862 8 2012-10-03 2345
## 863 0 2012-10-03 2350
## 864 0 2012-10-03 2355
## 865 47 2012-10-04 0
## 866 0 2012-10-04 5
## 867 0 2012-10-04 10
## 868 0 2012-10-04 15
Histogram of the total number of steps taken each day
# Total steps per date. na.rm is not passed to sum, so a date with any missing
# step count gets an NA total.
StepsByDay <- with(
  RawActivityData,
  aggregate(list(Steps = steps), by = list(Date = date), FUN = sum)
)
hist(StepsByDay$Steps, breaks = 20, col = "blue")
What is the mean total number of steps taken per day?
# Mean of the daily totals, skipping days whose total is NA
with(StepsByDay, mean(Steps, na.rm = TRUE))
## [1] 10766.19
What is the median number of steps taken per day?
# Median of the daily totals, skipping days whose total is NA
with(StepsByDay, median(Steps, na.rm = TRUE))
## [1] 10765
Make a time series plot (i.e. type = "l") of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all days (y-axis)
# Mean number of steps for each 5-minute interval across all days.
# na.rm = TRUE is forwarded by aggregate() to mean(), so missing step counts
# are left out of each interval's average.
ByInterval <- aggregate(
  list(Steps = RawActivityData$steps),
  by = list(Interval = RawActivityData$interval),
  FUN = mean,
  na.rm = TRUE
)
# type = "l" draws a line (time series) plot rather than the default points
plot(ByInterval, type = "l")
Which 5-minute interval, on average across all the days in the dataset, contains the maximum number of steps?
# Row(s) of ByInterval whose mean step count equals the overall maximum
with(ByInterval, ByInterval[Steps == max(Steps), ])
## Interval Steps
## 104 835 206.1698
On average, the 8:35-8:40 AM interval has the maximum number of steps.
Number of missing cases:
# Number of rows that contain at least one NA
sum(!complete.cases(RawActivityData))
## [1] 2304
Replacing missing data with the mean of the interval (truncated to an integer)
library(dplyr)
# Look up the mean step count of each given interval in ByInterval and
# truncate it to an integer. Intervals not present in ByInterval yield NA.
myreplace <- function(interval) {
  row_of_interval <- match(interval, ByInterval$Interval)
  interval_mean <- ByInterval$Steps[row_of_interval]
  as.integer(interval_mean)
}
# Keep the observed step count on complete rows; on rows containing an NA,
# substitute the value returned by myreplace() for that row's interval.
FilledActivityData <- mutate(
  RawActivityData,
  steps = ifelse(complete.cases(RawActivityData), steps, myreplace(interval))
)
This shows a small subset of filled data.
# Rows 2250 to 2260 of the filled data
FilledActivityData[2250:2260, ]
## steps date interval
## 2250 20 2012-10-08 1925
## 2251 27 2012-10-08 1930
## 2252 40 2012-10-08 1935
## 2253 30 2012-10-08 1940
## 2254 25 2012-10-08 1945
## 2255 45 2012-10-08 1950
## 2256 33 2012-10-08 1955
## 2257 19 2012-10-08 2000
## 2258 19 2012-10-08 2005
## 2259 19 2012-10-08 2010
## 2260 33 2012-10-08 2015
library(ggplot2)
# Daily totals from the raw data, labelled 'missing' because NAs were not
# filled. na.rm is not passed to sum, so dates with missing step counts get
# an NA total.
ByDateMissing <- aggregate(
  list(Steps = RawActivityData$steps),
  by = list(Date = RawActivityData$date),
  FUN = sum
)
ByDateMissing$set <- 'missing'
# Daily totals from the data whose NA step counts were replaced by the
# interval mean, labelled 'filled'.
ByDateFilled <- aggregate(
  list(Steps = FilledActivityData$steps),
  by = list(Date = FilledActivityData$date),
  FUN = sum
)
ByDateFilled$set <- 'filled'
# Stack both sets so their distributions can be overlaid in one histogram
compare <- rbind(ByDateFilled, ByDateMissing)
# after_stat(density) replaces the deprecated ..density.. notation; bins = 30
# states ggplot2's default explicitly.
ggplot(compare, aes(Steps, fill = set)) +
  geom_histogram(
    aes(y = after_stat(density)),
    alpha = 0.5,
    position = 'identity',
    bins = 30
  )
What is the mean total number of steps taken per day?
# Mean of the daily totals computed from the filled data
with(ByDateFilled, mean(Steps, na.rm = TRUE))
## [1] 10749.77
This differs from the value computed with the NA data omitted by:
# Filled-data mean minus raw-data mean (NA daily totals are skipped in each)
with(ByDateFilled, mean(Steps, na.rm = TRUE)) -
  with(ByDateMissing, mean(Steps, na.rm = TRUE))
## [1] -16.41819
What is the median number of steps taken per day?
# Median of the daily totals computed from the filled data
with(ByDateFilled, median(Steps, na.rm = TRUE))
## [1] 10641
This differs from the value computed with the NA data omitted by:
# Filled-data median minus raw-data median (NA daily totals are skipped in each)
with(ByDateFilled, median(Steps, na.rm = TRUE)) -
  with(ByDateMissing, median(Steps, na.rm = TRUE))
## [1] -124
library(dplyr)
library(lattice)
# Classify day names: "Saturday" and "Sunday" map to "weekend", every other
# value (including NA) maps to "weekday". Input is coerced to character first.
daytype <- function(daystring) {
  day_names <- as.character(daystring)
  ifelse(day_names %in% c("Saturday", "Sunday"), "weekend", "weekday")
}
# Converting date fields to the "Date" data type
ByDateFilled$Date <- as.Date(ByDateFilled$Date)
FilledActivityData$date <- as.Date(FilledActivityData$date)
# Tag every observation with its day name and its "weekend"/"weekday" type.
# NOTE(review): weekdays() returns names in the current locale, while daytype()
# matches the English names "Saturday"/"Sunday" - confirm an English locale.
DayAddedActivityData <- mutate(
  FilledActivityData,
  DayOfWeek = weekdays(date),
  WeekType = daytype(DayOfWeek)
)
# Mean steps per (week type, interval) pair. Only na.rm = TRUE is forwarded to
# mean(); data= and na.action= belong to aggregate()'s formula method and had
# no effect in this by-list call.
DayCompare <- aggregate(
  list(Steps = DayAddedActivityData$steps),
  by = list(
    days = DayAddedActivityData$WeekType,
    interval = DayAddedActivityData$interval
  ),
  FUN = mean,
  na.rm = TRUE
)
#for printing debug
#weekdays(ByDateFilled$Date)
# Adding a DayOfWeek column holding the day name returned by weekdays()
ByDateFilled <- mutate(ByDateFilled, DayOfWeek = weekdays(Date))
# Mean daily total for each day of the week; na.rm = TRUE is forwarded to
# mean() so NA totals are skipped.
MeanByDayOfWeek <- aggregate(
  list(Steps = ByDateFilled$Steps),
  by = list(Days = ByDateFilled$DayOfWeek),
  FUN = mean,
  na.rm = TRUE
)
# Adding a WeekType column with the "weekend" or "weekday" tag from daytype()
DataWithDayType <- mutate(ByDateFilled, WeekType = daytype(DayOfWeek))
# Line plot of mean steps against interval, one panel per value of days,
# with the panels stacked in a single column of two rows.
xyplot(
  Steps ~ interval | days,
  data = DayCompare,
  type = "l",
  layout = c(1, 2)
)