Import required libraries
library("tidyverse")
Registered S3 method overwritten by 'dplyr':
method from
print.rowwise_df
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
[30m── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──[39m
[30m[32m✓[30m [34mggplot2[30m 3.3.0 [32m✓[30m [34mpurrr [30m 0.3.3
[32m✓[30m [34mtibble [30m 3.0.0 [32m✓[30m [34mdplyr [30m 0.8.5
[32m✓[30m [34mtidyr [30m 1.0.2 [32m✓[30m [34mstringr[30m 1.4.0
[32m✓[30m [34mreadr [30m 1.3.1 [32m✓[30m [34mforcats[30m 0.5.0[39m
[30m── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31mx[30m [34mdplyr[30m::[32mfilter()[30m masks [34mstats[30m::filter()
[31mx[30m [34mdplyr[30m::[32mlag()[30m masks [34mstats[30m::lag()[39m
library("dplyr")
You can find more information on dataset_1 here. Special Thanks to Zack Thoutt for creating this dataset for us to use!
# read data and drop ID col
dataset_1 <- read.csv("../raw_data/dataset_1/winemag-data-130k-v2.csv")[-1]
You can find more information on dataset_2 here. Special Thanks to Sanyam Kapoor for creating this dataset for us to use!
Combine all the csv files for dataset_2
# function reads csv values
readfile <- function(filename){
cat("Reading file: ", filename, "...\n", sep = '' )
return(read.csv(file = filename, header = TRUE))
}
# function merges two dataframes together
merge_dataframes <- function(dataframe_1, dataframe_2){
return(rbind(dataframe_1, dataframe_2))
}
# funciton merges all csv files in folder to one dataframe
combine_data = function(mypath){
filenames=list.files(path=mypath, full.names=TRUE, pattern="*.csv")
datalist = lapply(filenames, function(x) readfile(x))
return(Reduce(f = function(x,y) merge_dataframes(x,y), x = datalist, accumulate = FALSE))
}
# merge csv data given dataset folder
dataset_2 <- combine_data("../raw_data/dataset_2")
Reading file: ../raw_data/dataset_2/winemag-1-800.csv...
Reading file: ../raw_data/dataset_2/winemag-10401-11200.csv...
Reading file: ../raw_data/dataset_2/winemag-11201-12000.csv...
Reading file: ../raw_data/dataset_2/winemag-12001-12800.csv...
Reading file: ../raw_data/dataset_2/winemag-1601-2400.csv...
Reading file: ../raw_data/dataset_2/winemag-2401-3200.csv...
Reading file: ../raw_data/dataset_2/winemag-3201-4000.csv...
Reading file: ../raw_data/dataset_2/winemag-4001-4800.csv...
Reading file: ../raw_data/dataset_2/winemag-4801-5600.csv...
Reading file: ../raw_data/dataset_2/winemag-5601-6400.csv...
Reading file: ../raw_data/dataset_2/winemag-6401-7200.csv...
Reading file: ../raw_data/dataset_2/winemag-7201-8000.csv...
Reading file: ../raw_data/dataset_2/winemag-8001-8800.csv...
Reading file: ../raw_data/dataset_2/winemag-801-1600.csv...
Reading file: ../raw_data/dataset_2/winemag-8801-9600.csv...
Reading file: ../raw_data/dataset_2/winemag-9601-10400.csv...
Join datasets by title, description, price, country, points = rating. We choose to join by these characteristics, because it would allow us to safely assume that the wine reviews being merged are the same. However, we intentionally left choose not to join based on certain chacteristics like province = region, because there were some input errors in the data. For example, in dataset_1 the province was “Sicily & Sardinia” where as in dataset_2 the region was “Sicily & Sardinia”.
join_wines <- inner_join(dataset_1, dataset_2, by = c("title", "description", "price", "country", "points" = "rating"))
Column `title` joining factors with different levels, coercing to character vectorColumn `description` joining factors with different levels, coercing to character vectorColumn `country` joining factors with different levels, coercing to character vector
Visualize joined data
head(join_wines)
Clean joined data to remove redundancies fields
join_wines_cleaned <- join_wines %>%
select(country, description, designation=designation.x, points, price, province, region=region_1, subregion=region_2, taster_name, taster_twitter_handle, title, variety, winery=winery.x, alcohol, category, url, vintage)
head(join_wines_cleaned)