# Load the raw movie data, rename a column, drop incomplete rows and turn the
# Runtime column into integers. Lines starting with "##" are console output
# recorded when the script was run; they are not executed.

# Make the workshop data folder the working directory so the relative file
# names used below resolve there.
setwd("C:/Workshop/Data")
# Read the tab-separated file, taking column names from the first row.
movies <- read.table(
file = "Movies.txt",
sep = "\t",
header = TRUE,
# Only the double quote is a quoting character, so an apostrophe inside a
# title (e.g. "Big Momma's House") is read as ordinary text.
quote = "\"")
# Preview the first rows.
head(movies)
## Title Year Rating Runtime Tomato.Meter Box.Office
## 1 The Whole Nine Yards 2000 R 98 min 45 $57.3M
## 2 Gladiator 2000 R 155 min 76 $187.3M
## 3 Cirque du Soleil 2000 G 39 min 45 $13.4M
## 4 Dinosaur 2000 PG 82 min 65 $135.6M
## 5 Big Momma's House 2000 PG-13 99 min 30 $0.5M
## 6 Gone in Sixty Seconds 2000 PG-13 118 min 24 $101.0M
# List the column names, then rename the fifth one from "Tomato.Meter" to
# "Critic.Score" by assigning into names().
names(movies)
## [1] "Title" "Year" "Rating" "Runtime"
## [5] "Tomato.Meter" "Box.Office"
names(movies)[5]
## [1] "Tomato.Meter"
names(movies)[5] <- "Critic.Score"
names(movies)
## [1] "Title" "Year" "Rating" "Runtime"
## [5] "Critic.Score" "Box.Office"
# Count the missing cells, drop every row that contains at least one NA, and
# confirm that none remain.
sum(is.na(movies))
## [1] 4
movies <- na.omit(movies)
sum(is.na(movies))
## [1] 0
# Runtime holds text such as "98 min", so it cannot be averaged directly: the
# mean() call below is expected to warn and return NA.
head(movies$Runtime)
## [1] 98 min 155 min 39 min 82 min 99 min 118 min
## 114 Levels: 100 min 101 min 102 min 103 min 104 min 105 min ... 99 min
mean(movies$Runtime)
## Warning in mean.default(movies$Runtime): argument is not numeric or
## logical: returning NA
## [1] NA
# NOTE: the recorded session shows a factor; on R >= 4.0 read.table() keeps
# strings as character by default, so this would print "character" instead.
# The conversion below works in both cases.
class(movies$Runtime)
## [1] "factor"
# Convert to plain strings first so the text, not the factor level codes, is
# what gets parsed.
runtimes <- as.character(movies$Runtime)
head(runtimes)
## [1] "98 min" "155 min" "39 min" "82 min" "99 min" "118 min"
class(runtimes)
## [1] "character"
# Remove the " min" suffix (sub() replaces the first match in each string).
runtimes <- sub(" min", "", runtimes)
head(runtimes)
## [1] "98" "155" "39" "82" "99" "118"
# Store the cleaned values back into the data frame as integers.
movies$Runtime <- as.integer(runtimes)
head(movies$Runtime)
## [1] 98 155 39 82 99 118
class(movies$Runtime)
## [1] "integer"
# The mean can now be computed.
mean(movies$Runtime)
## [1] 104.4052
# Box.Office needs similar cleaning: values carry a "$" prefix and an "M" or
# "k" magnitude suffix.
head(movies$Box.Office)
## [1] $57.3M $187.3M $13.4M $135.6M $0.5M $101.0M
## 1367 Levels: $0.1M $0.2M $0.3M $0.4M $0.5M $0.6M $0.7M $0.8M $0.9M ... $99.9k
#' Convert box-office strings to numeric millions of dollars
#'
#' Strips the "$" prefix and the magnitude suffix from values such as
#' "$57.3M" or "$99.9k" and rescales the remaining number to millions.
#' The function is vectorized, so it can be called on a whole column as well
#' as on a single value (e.g. from sapply()/vapply()).
#'
#' @param boxOffice Character vector or factor of box-office amounts.
#' @return Numeric vector of the same length as `boxOffice`, in millions:
#'   values containing "M" are kept as is, values containing "k" are
#'   multiplied by 0.001, and all other values are multiplied by 0.000001.
#'   Entries that cannot be parsed as numbers become NA.
convertBoxOffice <- function(boxOffice) {
  stringBoxOffice <- as.character(boxOffice)

  # Inside a character class "|" is a literal pipe, not alternation, so the
  # class only needs to list the three characters to strip.
  numericBoxOffice <- as.numeric(gsub("[$kM]", "", stringBoxOffice))

  # Start from the plain-dollar scale, then override per suffix. "M" is
  # assigned last so it takes precedence, matching the original if/else order.
  multiplier <- rep(0.000001, length(stringBoxOffice))
  multiplier[grepl("k", stringBoxOffice, fixed = TRUE)] <- 0.001
  multiplier[grepl("M", stringBoxOffice, fixed = TRUE)] <- 1

  numericBoxOffice * multiplier
}
# Convert every Box.Office entry to numeric millions of dollars.
# vapply() guarantees one numeric value per element, and USE.NAMES = FALSE
# keeps the original strings from being attached as element names when the
# column is character rather than factor.
movies$Box.Office <- vapply(
  movies$Box.Office,
  convertBoxOffice,
  numeric(1),
  USE.NAMES = FALSE
)
head(movies$Box.Office)
## [1] 57.3 187.3 13.4 135.6 0.5 101.0
class(movies$Box.Office)
## [1] "numeric"
# With the column numeric, summary statistics can be computed.
mean(movies$Box.Office)
## [1] 40.67558
# Save the cleaned data to the working directory. write.csv() also writes the
# row names as an unnamed first column.
write.csv(movies, "Movies2.csv")
# Verify that the new CSV file (Movies2.csv) exists in your C:/Workshop/Data directory.
# Verify the file's contents by opening the file.
# Build the same 2014 revenue-by-rating report twice with dplyr: first step by
# step through a temporary variable, then as a single pipeline. Lines starting
# with "##" are recorded console output.
library(dplyr)
# Keep only the columns the report needs.
temp <- select(movies, Year, Rating, Box.Office)
# Keep only the rows for 2014.
temp <- filter(temp, Year == 2014)
## Warning: package 'bindrcpp' was built under R version 3.4.1
# Box.Office was converted to millions of dollars earlier in this script, so
# dividing by 1000 expresses Revenue in billions.
temp <- mutate(temp, Revenue = Box.Office / 1000)
# Total the revenue within each rating.
temp <- group_by(temp, Rating)
temp <- summarize(temp, Total.Revenue = sum(Revenue))
# Sort with the highest total first.
temp <- arrange(temp, desc(Total.Revenue))
print(temp)
## # A tibble: 4 x 2
## Rating Total.Revenue
## <fctr> <dbl>
## 1 PG-13 4.207022
## 2 R 1.985307
## 3 PG 1.931452
## 4 G 0.159132
# Same steps chained with %>%, which passes each result as the first argument
# of the next call, so no temporary variable is needed.
report <- movies %>%
select(Year, Rating, Box.Office) %>%
filter(Year == 2014) %>%
mutate(Revenue = Box.Office / 1000) %>%
group_by(Rating) %>%
summarize(Total.Revenue = sum(Revenue)) %>%
arrange(desc(Total.Revenue)) %>%
# Convert the tibble to a plain data frame; the output below prints without
# the tibble header and column-type row.
as.data.frame()
print (report)
## Rating Total.Revenue
## 1 PG-13 4.207022
## 2 R 1.985307
## 3 PG 1.931452
## 4 G 0.159132