This file provides essential R code to help users get started with data manipulation, analysis, and visualization tasks.

Load dataset

Load a CSV (Comma-Separated Values) file

# 'mtcars' ships with R, so it is a convenient table to write out and read back
# Export the data frame to disk in CSV format
write.csv(
  x = mtcars,
  file = "/Users/tranchau/Documents/OMG_tutorial/dataset/data.csv"
)

# Import the CSV file again
# The location is an absolute path, i.e. the full path starting at the root directory
# 'row.names = 1' turns the first column into row names;
# 'header = TRUE' treats the first row as the column names
data_csv <- read.csv(
  file = "/Users/tranchau/Documents/OMG_tutorial/dataset/data.csv",
  row.names = 1,
  header = TRUE
)

Load a TSV (Tab-Separated Values) file

# Export the data frame as tab-separated text
# 'quote = FALSE' writes character values without surrounding quotes
write.table(
  x = mtcars,
  file = "/Users/tranchau/Documents/OMG_tutorial/dataset/data.tsv",
  quote = FALSE,
  sep = "\t"
)

# Import the TSV file again
# 'sep = "\t"' tells read.table() that fields are separated by tabs
data_tsv <- read.table(
  file = "/Users/tranchau/Documents/OMG_tutorial/dataset/data.tsv",
  sep = "\t",
  header = TRUE,
  row.names = 1
)

Load an Excel file (.xlsx, .xls)

# 'readxl' and 'writexl' are not built-in packages; install them once before use.
# Pass both names as ONE character vector: the second positional argument of
# install.packages() is the library directory, not another package name.
# install.packages(c("readxl", "writexl"))
library(writexl)
# write_xlsx() does not write row names, so the car names are copied into a
# regular column ('car') first; otherwise they would be lost in the Excel file
write_xlsx(
  data.frame(car = rownames(mtcars), mtcars, row.names = NULL),
  "/Users/tranchau/Documents/OMG_tutorial/dataset/data.xlsx"
)

library(readxl)
# read_excel() returns a tibble; the car names come back in the 'car' column
data_excel <- read_excel("/Users/tranchau/Documents/OMG_tutorial/dataset/data.xlsx", sheet = 1)
# 'sheet' specifies the sheet number or name

Load an RDS file (a format that stores a single R object)

# Serialize one R object to an RDS file
saveRDS(
  object = mtcars,
  file = "/Users/tranchau/Documents/OMG_tutorial/dataset/data.rds"
)

# Read the object back; readRDS() returns it, so it can be bound to any name
data_rds <- readRDS(file = "/Users/tranchau/Documents/OMG_tutorial/dataset/data.rds")

Load an RData file (a format that can store multiple R objects)

# Save multiple objects to an RData file
# Two different built-in datasets are listed here; the original call named
# 'mtcars' twice, which did not actually demonstrate saving several objects
save(mtcars, iris, file = "/Users/tranchau/Documents/OMG_tutorial/dataset/data.RData")

# Load an RData file
# load() restores every saved object under the name it was saved with
# (here 'mtcars' and 'iris'), so no assignment is needed
load("/Users/tranchau/Documents/OMG_tutorial/dataset/data.RData")

Data inspection

data("mtcars")       # Make the built-in mtcars dataset available in the workspace
head(mtcars, n = 6)  # Preview the first six rows (the default for head())
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
str(object = mtcars)  # Compact overview: dimensions, column types and leading values
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
summary(object = mtcars)  # Per-column minimum, quartiles, median, mean and maximum
##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :196.3   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec             vs        
##  Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000  
##  Median :3.695   Median :3.325   Median :17.71   Median :0.0000  
##  Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375  
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000  
##  Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000  
##        am              gear            carb      
##  Min.   :0.0000   Min.   :3.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000  
##  Median :0.0000   Median :4.000   Median :2.000  
##  Mean   :0.4062   Mean   :3.688   Mean   :2.812  
##  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :5.000   Max.   :8.000

Data manipulation

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Build a reduced table: keep three columns, keep rows above an mpg threshold,
# then add a derived column
cleaned_data <- mtcars %>%
  select(mpg, cyl, disp) %>%   # keep only the columns of interest
  filter(mpg > 10) %>%         # keep rows whose mpg is greater than 10
  mutate(mpg_new = 2 * mpg)    # new column holding twice the mpg value

head(cleaned_data, n = 6)
##                    mpg cyl disp mpg_new
## Mazda RX4         21.0   6  160    42.0
## Mazda RX4 Wag     21.0   6  160    42.0
## Datsun 710        22.8   4  108    45.6
## Hornet 4 Drive    21.4   6  258    42.8
## Hornet Sportabout 18.7   8  360    37.4
## Valiant           18.1   6  225    36.2

Data summarization

# Average mpg within each cylinder group:
# group_by() splits the rows by 'cyl', summarise() collapses each group to one row
summary_data <- summarise(
  group_by(mtcars, cyl),
  mean_mpg = mean(mpg, na.rm = TRUE)
)

summary_data
## # A tibble: 3 × 2
##     cyl mean_mpg
##   <dbl>    <dbl>
## 1     4     26.7
## 2     6     19.7
## 3     8     15.1

Data Visualization with ggplot2

# Load the library
library(ggplot2)

# Scatter plot of displacement (y) against miles per gallon (x)
ggplot(data = mtcars, mapping = aes(x = mpg, y = disp)) +
  geom_point(colour = "pink2") +
  xlab("mpg") +
  ylab("disp") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 12),                  # x-axis tick labels
    axis.text.y = element_text(size = 12),                  # y-axis tick labels
    axis.title.x = element_text(size = 15, face = "bold"),  # x-axis title
    axis.title.y = element_text(size = 15, face = "bold")   # y-axis title
  )

Statistical test

# Compare mean displacement between the two 'vs' groups with a two-sample t-test
# 'var.equal = FALSE' is the default and gives the Welch (unequal-variance) test
t_test_result <- t.test(disp ~ vs, data = mtcars, var.equal = FALSE)
t_test_result
## 
##  Welch Two Sample t-test
## 
## data:  disp by vs
## t = 5.9416, df = 26.977, p-value = 2.477e-06
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  114.3628 235.0229
## sample estimates:
## mean in group 0 mean in group 1 
##        307.1500        132.4571

Linear Regression Model

# Fit displacement as a linear function of mpg, then print the model summary
linear <- lm(formula = disp ~ mpg, data = mtcars)
summary(object = linear)
## 
## Call:
## lm(formula = disp ~ mpg, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -103.05  -45.74   -8.17   46.65  153.75 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  580.884     41.740  13.917 1.26e-14 ***
## mpg          -17.429      1.993  -8.747 9.38e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 66.86 on 30 degrees of freedom
## Multiple R-squared:  0.7183, Adjusted R-squared:  0.709 
## F-statistic: 76.51 on 1 and 30 DF,  p-value: 9.38e-10
# Scatter plot of disp against mpg with the fitted straight line drawn on top
ggplot(data = mtcars, mapping = aes(x = mpg, y = disp)) +
  geom_point(colour = "pink2") +
  geom_smooth(method = "lm", se = FALSE, colour = "black") +  # regression line, no confidence band
  xlab("Miles per Gallon (mpg)") +
  ylab("Displacement (disp)") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12),
    axis.title.x = element_text(size = 15, face = "bold"),
    axis.title.y = element_text(size = 15, face = "bold")
  )
## `geom_smooth()` using formula = 'y ~ x'

Apply a function to each column in the dataframe

# Apply a summary function to every column of the data frame.
# vapply() is used instead of sapply() because it fixes the result type in
# advance (exactly one numeric value per column) and errors if a column ever
# yields something else. Each result gets its own descriptive name, so the
# maxima and minima no longer overwrite 'column_means'.
column_means <- vapply(mtcars, mean, numeric(1), na.rm = TRUE)
column_means
##        mpg        cyl       disp         hp       drat         wt       qsec 
##  20.090625   6.187500 230.721875 146.687500   3.596563   3.217250  17.848750 
##         vs         am       gear       carb 
##   0.437500   0.406250   3.687500   2.812500
column_maxs <- vapply(mtcars, max, numeric(1), na.rm = TRUE)
column_maxs
##     mpg     cyl    disp      hp    drat      wt    qsec      vs      am    gear 
##  33.900   8.000 472.000 335.000   4.930   5.424  22.900   1.000   1.000   5.000 
##    carb 
##   8.000
column_mins <- vapply(mtcars, min, numeric(1), na.rm = TRUE)
column_mins
##    mpg    cyl   disp     hp   drat     wt   qsec     vs     am   gear   carb 
## 10.400  4.000 71.100 52.000  2.760  1.513 14.500  0.000  0.000  3.000  1.000

Export a plot to a PDF file

# Store the regression plot in a variable so it can be displayed and exported.
# '<-' is used for assignment, consistent with the rest of this file.
plot_linear <- ggplot(mtcars, aes(x = mpg, y = disp)) +
  geom_point(color = "pink2") +
  geom_smooth(method = "lm", color = "black", se = FALSE) +  # linear regression line
  labs(x = "Miles per Gallon (mpg)", y = "Displacement (disp)") +
  theme_minimal() +
  theme(
    axis.title.x = element_text(size = 15, face = "bold"),
    axis.title.y = element_text(size = 15, face = "bold"),
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12)
  )

# Typing the object name at top level displays the plot
plot_linear
## `geom_smooth()` using formula = 'y ~ x'

# Open a PDF graphics device; 'width' and 'height' are given in inches
pdf("/Users/tranchau/Documents/OMG_tutorial/dataset/save.pdf", width = 7, height = 5)
# Draw the plot with an explicit print(): a bare object name is only rendered by
# top-level auto-printing, so the PDF would be empty when this code is run
# through source() or from inside a function or loop
print(plot_linear)
## `geom_smooth()` using formula = 'y ~ x'
# Close the device so the file is completely written to disk
dev.off()
## quartz_off_screen 
##                 2