R Programming
R - basics
Background
- Open Source
- Relatively Slow
Alternatives
- Excel - Not really, but people use it anyway.
- Matlab
- Python, Perl, ...
Install R
Ubuntu:
sudo apt-get install r-base
Install R Studio
Download R Studio, the IDE used to write and execute R code.
Launch interactive R, get help, and quit
- R
- help
- q
$ R
> help()
> help(class)
...
> q()
Running R on the command line using Rscript
- Rscript
print("Hello World")
Rscript hello_world.R
Comments
- Everything from a "#" character to the end of the line is a comment
fname = "Foo"
lname = "Foo"
age = 42
print(fname)
# This line is a comment
# The next line is some code commented out:
# print(lname)
print(age) # We can also comment here
R and simple math operations
2 + 3 # 5
2 * 3 # 6
2 - 3 # -1
2 / 3 # 0.6666667
2 ^ 3 # 8
R variables
# Sides of the rectangle.
width = 8
height = 3
# The area of a rectangle is its height multiplied by its width.
area_of_rectangle = height * width
print(area_of_rectangle) # 24
R assignment (left-assignment, right-assignment)
- Both = and <- can be used for left-assignment
- -> Can be used for right-assignment
x = 1
y <- 2
3 -> z
print(x) # 1
print(y) # 2
print(z) # 3
Variable types (numeric, character, logical, function)
-
class
-
isa
-
numeric
-
character
-
logical
-
function
-
numeric: 1 34 2.7
-
character: "hello" "23"
-
logical: TRUE FALSE
-
function: class print
class(23) # numeric
class(2.3) # numeric
class(NaN) # numeric
class("Hello") # character
class("23") # character
class(TRUE) # logical
class(FALSE) # logical
class(T) # logical
class(F) # logical
class(NA) # logical
class(class) # function
class(print) # function
isa(2, "numeric") # TRUE
isa("2", "character") # TRUE
isa(FALSE, "logical") # TRUE
Variable types are deduced
- class
- isa
- numeric
- character
- logical
x = 23
class(x) # numeric
x = "George"
class(x) # character
x = T
class(x) # logical
isa(2, "numeric") # TRUE
isa("2", "character") # TRUE
isa(FALSE, "logical") # TRUE
paste (join) strings and numbers together
- paste
paste("Hello", "World", "!") # "Hello World !"
name = "Foo"
age = 42
city = "Budapest"
paste(name, age, city) # "Foo 42 Budapest"
paste(name, age, city, sep="_") # "Foo_42_Budapest"
Operator precedence and parentheses
Operators comparing numbers
x = 2
y = 3
z = 3
x > y # FALSE
x < y # TRUE
x <= y # TRUE
x >= y # FALSE
x == y # FALSE
x != y # TRUE
y == z # TRUE
Operators comparing strings (characters)
- characters are compared in ABC order (but not ASCII order!)
"a" < "b" # TRUE
"a" < "A" # TRUE
"abc" < "abd" # TRUE
2 < 11 # TRUE
"2" < "11" # FALSE
2 < "a" # TRUE
Convert string (character) to numeric
- as.numeric
# A string that holds digits is still of class "character".
x = "2"
class(x) # "character"
# as.numeric converts the string to a number.
y = as.numeric(x)
class(y) # "numeric"
Boolean (logical) operations
TRUE & TRUE # TRUE
TRUE & FALSE # FALSE
TRUE | FALSE # TRUE
FALSE | FALSE # FALSE
! TRUE # FALSE
! FALSE # TRUE
# Logical Operations
a = TRUE
b = FALSE
a & b # logical AND FALSE
a | b # logical OR TRUE
! a # logical NOT FALSE
x = 2
y = 4
x & y # TRUE
x & 0 # FALSE
print(2 & 4) # TRUE
print(2 & 0) # FALSE
print(2 | 0) # TRUE
Concatenate strings
x = "Foo"
y = "Bar"
# x + y
# Error in x + y : non-numeric argument to binary operator
# Execution halted
z = paste(x, y, sep="")
z # FooBar
length(z) # 1
# Join elements of a vector
fruits = c("Apple", "Banana")
q = paste(fruits, collapse ="-")
q # Apple-Banana
# join elements of a numeric vector
numbers = c(2, 3, 4)
class(numbers) # numeric
nums = paste(numbers, collapse ="-")
nums # 2-3-4
Convert between types using as.
- as.factor
Printing with cat
# cat() has no "end" parameter: a named argument it does not know is simply
# printed like any other value. Pass the newline as an ordinary argument.
cat("Hello", "World", "!", "\n")
Vectors
R vectors
-
c
-
c stands for concatenate
x = c(2, 5, 1, 19)
print(x)
y = x + 2
print(y)
z = x * 2
print(z)
lg = log(x)
print(lg)
{% embed include file="src/examples/vectors/vector.out" %}
Variable types of vectors are deduced
-
We'll learn a lot more about vectors later
-
TODO: Mixed data?
y = c(2, 7, 3)
class(y) # numeric
the_truth = c(TRUE, FALSE, TRUE, TRUE, FALSE)
class(the_truth) # logical
length(the_truth) # 5
One element vector is the same as a single value
name = "foo"
names = c("foo")
print(name == names)
class(name)
class(names)
print(name[1])
print(names[1])
{% embed include file="src/examples/vectors/one_element_vector.out" %}
Sum of values in vectors
y = c(2, 7, 3)
sum(y) # 12
sum(TRUE) # 1
sum(FALSE) # 0
the_truth = c(TRUE, FALSE, TRUE, TRUE, FALSE)
sum(the_truth) # 3
Size of vector (length of vector)
distances = c(11, 12, 13, 14)
length(distances) # 4
Access the n-th element of a vector
- It is
some_vector[n]
- It is 1-based
distances = c(11, 12, 13, 14)
distances[1] # 11
distances[4] # 14
distances[5] # NA
R Vector: Negative index, exclude element
Using a negative index will give you the same vector without that element
distances = c(11, 12, 13, 14)
nd = distances[-2]
print(nd) # 11 13 14
print(distances) # 11 12 13 14
R vector: Access several elements (slice)
- We can use a vector as the index of another vector thereby fetching a new vector of the specific values.
- We can also select a range of elements using :
distances = c(11, 12, 13, 14, 15, 16, 17, 18)
print(distances[c(2, 4)]) # 12, 14
print(distances[2:4]) # 12, 13, 14
locations = c(5, 3, 5, 7)
nd = distances[locations]
print(nd) # 15, 13, 15, 17
print(distances)
R vector: Exclude several elements
distances = c(11, 12, 13, 14, 15, 16, 17, 18)
print(distances[-c(2, 4)]) # 11 13 15 16 17 18
print(distances[-(2:4)]) # 11 15 16 17 18
Some basic statistical functions
- median
- sum
- mean
- min
- max
# Sample data used for the basic statistical functions below.
distances = c(11, 12, 13, 14, 15, 16, 17, 180, 7, 11)
sum(distances)
median(distances)
mean(distances) # average
min(distances)
max(distances)
var(distances) # variance
# The standard deviation function is sd(); there is no st() in R.
sd(distances) # standard deviation (square root of the variance)
3 ways to create vectors
- c
- seq
- :
x = 1:3
y = seq(1, 3, 1)
z = c(1, 2, 3)
print(x)
print(y)
print(z)
print(x == y)
print(x == z)
print(y == z)
# 1 2 3
# 1 2 3
# 1 2 3
# TRUE TRUE TRUE
# TRUE TRUE TRUE
# TRUE TRUE TRUE
R - sequences and ranges
- seq
nums = seq(1, 10, 2)
print(nums) # 1 3 5 7 9
print(class(nums)) # "numeric"
print(is.numeric(nums)) # TRUE
other = seq(to = 19, from = 7, by = 3)
print(other) # 7 10 13 16 19
Range of numbers
- :
range_of_numbers = 1:10
print(range_of_numbers) # 1 2 3 4 5 6 7 8 9 10
Filter values
values = c(17, 3, 28, 5)
large_logic = values > 10
large = values[large_logic]
print(large) # 17, 28
# Filtering a vector with a logical condition written directly in the index.
n = c(2, 4, 7, 6, 9, 8)
n[ n > 4 ]
n[ n > 4 & n %% 2] # greater than four and non-zero modulus 2 (odd numbers)
# n[ some-logical-vector ] to filter certain values and create a shorter vector
n[ n >= 4 ]
Index of true elements - which
- which
# the index of the elements that have TRUE value
which(c(TRUE, FALSE, FALSE, TRUE)) # 1, 4
which
Showing the indexes of the elements that are TRUE in a boolean vector
n = c(2, 4, 7, 6, 9, 8)
which(n > 4)
which(n > 4 & n %% 2)
Matrix TBD
# Show a matrix
Vector operations - reuse values from shorter vector
x = c(1, 2, 3, 4)
y = c(10, 20)
x+y
# There is no warning here !
x = c(1, 2, 3, 4)
y = c(10, 20)
print(x*y) # 10 40 30 80
# 1 * 10
# 2 * 20
# 3 * 10
# 4 * 20
# R starts to reuse the shorter vector if one of the vectors is longer than the other
# gives a warning
# TODO: can we stop running at such warnings?
a = c(TRUE, TRUE, FALSE)
b = c(TRUE, FALSE, TRUE, FALSE)
a & b
a | b
! a
x = c(1, 2, 3)
y = c(10, 20)
x+y
# options(warn=2) # turn warnings into errors
# options(warn=0) # ???? turn warnings back to warnings
Summary of numeric data
- summary
- mean
nums = c(2, 4, 5)
sum(nums)
mean(nums)
max(nums)
min(nums)
median(nums)
summary(nums)
bools = c(T, F, T, T, F)
summary(bools)
mean(bools)
max(bools)
min(bools)
sum(bools)
names = c("biology", "chemistry", "physics")
summary(names)
#mean(names)
max(names)
min(names)
#sum(names)
Change element of vector
colors = c("red", "blue", "green")
colors # "red", "blue", "green"
colors[1] # red
colors[1] = "purple"
colors # "purple", "blue", "green"
Assign vector to another name
colors = c("red", "blue", "green")
colors # "red", "blue", "green"
# Assignment gives other_colors its own copy of the values.
other_colors = colors
colors[1] # red
colors[1] = "purple"
colors # "purple", "blue", "green"
other_colors # "red", "blue", "green" (the copy is not affected by the change)
Reverse vector
- rev
- reverse
fruits = c("apple", "peach", "lemon", "banana", "ananas")
rev(fruits) # "ananas" "banana" "lemon" "peach" "apple"
Sort vector
- sort
numbers = c(7, 3, 9, 11, 2)
sort(numbers) # 2 3 7 9 11
fruits = c("apple", "peach", "lemon", "banana", "ananas")
sort(fruits) # "ananas" "apple" "banana" "lemon" "peach"
sort(fruits, decreasing=TRUE) # "peach" "lemon" "banana" "apple" "ananas"
Sort using order
- order
numbers = c(7, 3, 9, 11, 2)
order(numbers) # 5 2 1 3 4
numbers[order(numbers)] # 2 3 7 9 11
fruits = c("apple", "peach", "lemon", "banana", "ananas")
order(fruits) # 5 1 4 3 2
fruits[ order(fruits) ] # "ananas" "apple" "banana" "lemon" "peach"
fruits[ order(fruits, decreasing = T) ] # "peach" "lemon" "banana" "apple" "ananas"
Operators comparing vectors
# Show how to compare a vector with an individual value (=> this just plays on the shorter being repeated to the length of the longer)
x = c(1, 2)
y = c(11, 0)
x > y
x < y
x <= y
x >= y
x == y
x != y
Convert vector of strings to numerics
- as.numeric
chars = c("1", "2", "3")
nums = as.numeric(chars)
print(class(chars))
print(class(nums))
print(summary(chars))
print(summary(nums))
print(sum(nums)) # 6
print(sum(chars)) # invalid 'type' (character) of argument
Repeat the same number
- rep
same_number = rep(2, 10)
print(same_number) # 2 2 2 2 2 2 2 2 2 2
Boolean (logical) operations on vectors
a = c(TRUE, TRUE, FALSE, FALSE)
b = c(TRUE, FALSE, TRUE, FALSE)
sum(a)
sum(b) # number of TRUE items
a & b
a | b
! a
a = c(TRUE, TRUE, FALSE, TRUE)
b = c(TRUE, FALSE, TRUE, FALSE)
c = c(TRUE, FALSE, FALSE, TRUE)
a & b
a | b
! a
# operator precedence
# the use of parentheses
a & (b | c)
Factors
- "Category", "enumerated type"
m = c("apple", "apple", "banana", "apple", "peach", "banana", "apple")
m
f = as.factor(m)
summary(m)
summary(f)
levels(f)
# levels - the possible values a variable can have
fruits_vector = c("Apple", "Banana", "Apple", "Apple", "Peach", "Banana")
fruits_factor = as.factor(fruits_vector)
fruits_vector # [1] "Apple" "Banana" "Apple" "Apple" "Peach" "Banana"
fruits_factor # [1] Apple Banana Apple Apple Peach Banana
# Levels: Apple Banana Peach
class(fruits_vector) # "character"
class(fruits_factor) # "factor"
length(fruits_vector) # 6
length(fruits_factor) # 6
levels(fruits_vector) # NULL
levels(fruits_factor) # "Apple" "Banana" "Peach"
fruits_vector["Apple"] # NA
fruits_factor["Apple"] # [1] <NA>
# Levels: Apple Banana Peach
Append to end of vector
animals = c("cat", "dog")
length(animals)
animals = append(animals, "mouse")
length(animals)
Data Frames
Data Frame functions
data.frames dim head nrow ncol names - get the names or assign new names to the data.frame names(dataframe) = c("title1", "title2", ...) summary Subsetting (indexing ranges) subset() order() returns a numeric vector of the sorted indexes
Iris dataset
Built-in Iris data set
library(datasets)
#write.table(file="a.txt", iris, sep="\t")
iris
filename = "/home/gabor/Dropbox/Weizmann/R/iris_dataset.txt"
# setwd - set working directory
# getwd - get working directory
ir = read.table(filename, sep="\t", header=T)
head(ir, 3)
tail(ir, 3)
View(ir) # Capital V !!! (opens a separate view of the data)
class(ir) # data.frame
dim(ir) # 150 5 dimensions: (rows, columns)
ir[2, 3]
head(ir)
# The indexes on the left hand side are also called "row-names"
ir[,1] # access a column by its location
ir$seplen # access a column by its name
# Selecting several columns by name keeps the result a data frame.
ir1 = ir[, c("seplen", "sepwid")] # dataframe
head(ir1)
# Selecting a single column by name gives back a vector instead.
column = ir[, c("seplen")] # numeric vector
summary(ir)
Load Iris dataset
iris = read.table(file="data/iris.txt", sep=" ", header=T, stringsAsFactors=T)
head(iris)
# hist(iris$Sepal.Length)
hist(iris$Petal.Width)
# Mean petal width, shown in the sub-title of the histogram below.
mean_petal_width = mean(iris$Petal.Width)
# Histogram of the petal width with explicit breaks, color, and labels.
hist(
x=iris$Petal.Width,
breaks=10,
col="light blue",
main="Distribution of Petal Width",
xlab="Width of Petal (cm)",
#ylab="",
sub=paste("Mean", mean_petal_width), # paste concatenates values
)
#help(hist)
#View(iris)
#tail(iris)
# iris
#class(iris)
#dim(iris) # rows, columns
# iris[2, 4]
# iris[1, ]
# iris[, 1]
#iris$Sepal.Length
#iris['Sepal.Length'] # numeric vector
#iris[c('Sepal.Length', 'Petal.Length')] # data.frame
#iris[,c('Sepal.Length', 'Petal.Length')] # data.frame
#summary(iris)
colnames(iris)
plot(iris$Petal.Length, iris$Sepal.Length)
help(plot) # or ?plot
plot(
x=iris$Petal.Length,
y=iris$Sepal.Length,
#type="l", # or p for points
#pch=20, # change how the points look like
# col (short for color)
#col="purple red",
col=iris$Species,
#pch=19
xlab="X Title",
ylab="Y Title",
main="Main heading",
sub="Sub heading",
)
# In the iris dataset, when we draw the plot, how does it pick colors when we only have names?
# Because it is a factor (because of stringsAsFactors)
class(iris$Species) # "factor"
# Each "level" in the factor has a numerical value (1, 2, 3) and in R each color also a number (1 = black 2 = red, etc)
plot(
x=iris$Petal.Length,
y=iris$Sepal.Length,
pch=as.numeric(iris$Species),
)
# TODO: how to pick the specific colors?
# how to color the points according to some other condition? e.g. iris$Sepal.Width > 3
plot(
x=iris$Petal.Length,
y=iris$Sepal.Length,
col=as.numeric(iris$Sepal.Width > 3)+1
)
# iris$Sepal.Width > 3 is a boolean vector
# as.numeric(iris$Sepal.Width > 3) is a numerical vector of 0 and 1 values
# careful: color 0 is white so we won't see it, that's why we add 1 so instead of 0 and 1 we will get 1 and 2 values.
levels(iris$Species) # "setosa" "versicolor" "virginica"
help(data.frame)
pairs(iris[,1:4]) # pairwise relation graphs
pairs(iris[,1:4], col=as.numeric(iris$Species))
pairs(iris[,1:4], col=as.numeric(iris$Species), upper.panel=NULL)
Sort dataset by a column
iris = read.table(file="data/iris.txt", sep=" ", header=T, stringsAsFactors=T)
head(iris)
ord = order(iris$Sepal.Length)
head(iris[ord,])
Order dataframe by two columns (secondary sort)
df = data.frame(A=c(2, 1, 1, 3), B=c(2, 4, 3, 1))
df
df[order(df$A),]
df[order(df$B),]
df[order(df$A, df$B),]
Strings (characters)
Substitute first occurrence using sub
- sub
text <- "One cat two cats and another single cat"
print(text)
print(sub('cat', 'dog', text))
print(text)
[1] "One cat two cats and another single cat"
[1] "One dog two cats and another single cat"
[1] "One cat two cats and another single cat"
Substitute all occurrences using gsub
- gsub
text <- "One cat two cats and another single cat"
print(text)
print(gsub('cat', 'dog', text))
print(text)
[1] "One cat two cats and another single cat"
[1] "One dog two dogs and another single dog"
[1] "One cat two cats and another single cat"
Functions
Functions overview
# Template of a function definition: a name, an argument list, and a body.
# someReturnValue is a placeholder for whatever the commands compute.
function_name = function(arg1, arg2, arg3) {
# commands
return(someReturnValue)
}
-
Function parameter names are local to the function
-
variables created in the function are local
-
Result of the last expression is returned implicitly (even without calling return())
-
When accepting a vector of numbers what if the user sends in a vector of strings (different class of data)?
-
What if some of the values are missing? (NA)
-
What if the vector is empty?
Simple add function
# Area of a rectangle.
#
# a, b: lengths of the two sides.
# Returns the product of the two side lengths.
rectangle_area = function(a, b) {
  a * b
}
rectangle_area(2, 3)
rectangle_area(4, 7)
# area - not defined here
a = c(0, 0)
b = c(3, 4)
# Euclidean distance between two points in the plane.
#
# a, b: numeric vectors of length 2 holding the (x, y) coordinates.
# Prints the horizontal difference and then the distance. The distance is
# also the (invisible) return value, because it is the value of the last
# print() call.
distance = function(a, b) {
  # R vectors are 1-based: element 1 is the x coordinate, element 2 is y.
  # Index 0 would select nothing and give an empty vector.
  xdiff = abs(a[1] - b[1])
  print(xdiff)
  ydiff = abs(a[2] - b[2])
  print(sqrt(xdiff^2 + ydiff^2))
}
distance(a, b)
Recursive Fibonacci
# Recursive computation of the n-th Fibonacci number (1, 1, 2, 3, 5, ...).
#
# n: a single positive whole number, the 1-based position in the sequence.
# Returns the n-th Fibonacci number.
# Stops with an error for any other input: such a value would never hit a
# base case and the recursion would continue until the stack overflows.
fibonacci = function(n) {
  if (!is.numeric(n) || length(n) != 1 || !is.finite(n) ||
      n < 1 || n != floor(n)) {
    stop("n must be a single positive whole number", call. = FALSE)
  }
  # The first two Fibonacci numbers are both 1.
  if (n <= 2) {
    return(1)
  }
  fibonacci(n - 1) + fibonacci(n - 2)
}
# Print the first ten Fibonacci numbers, one per line.
# cat() has no "end" parameter (an unknown named argument is just printed),
# so the newline is passed as an ordinary value.
for (n in 1:10) {
  cat(fibonacci(n), "\n")
}
Fibonacci
# Iterative Fibonacci: build the first n Fibonacci numbers.
#
# n: a single positive whole number.
# Returns a numeric vector of length n: 1 1 2 3 5 8 ...
# For n == 2 this is c(1, 1), consistent with every larger n, instead of a
# single 1.
# Stops with an error for any other input.
fibonacci = function(n) {
  if (!is.numeric(n) || length(n) != 1 || !is.finite(n) ||
      n < 1 || n != floor(n)) {
    stop("n must be a single positive whole number", call. = FALSE)
  }
  # Preallocate the result; the first two values are already correct (1, 1).
  fibs = rep(1, n)
  # Positions 3..n; empty when n is 1 or 2, so the loop is skipped
  # (3:n would count downwards in that case).
  for (i in seq_len(n)[-(1:2)]) {
    fibs[i] = fibs[i - 1] + fibs[i - 2]
  }
  fibs
}
# cat() has no "end" parameter; pass the newline as an ordinary value.
cat(fibonacci(10), "\n")
Testing
Testthat
# Add the project-local "lib" folder to the library search path, install
# testthat into it, and load the package.
.libPaths(append(.libPaths(), "lib"))
install.packages("testthat", "lib")
library("testthat")
Test Example
# Return the sum of x and y.
add = function(x, y) {
  x + y
}
# Return the product of x and y.
multiply = function(x, y) {
  x * y
}
Using the functions
# Make the project-local "lib" folder available and install this.path into
# it only when it is not installed yet.
.libPaths(append(.libPaths(), "lib"))
install.packages(setdiff("this.path", rownames(installed.packages())), "lib")
library("this.path")
# Load mymath.R from the directory of this script.
root = dirname(this.path())
source(file.path(root, "mymath.R"))
# R has no exit() function; quit(status = ...) ends the script and sets the
# exit code of the process.
if (add(2, 3) != 5) {
  quit(status = 1)
}
if (add(2, -2) != 0) {
  quit(status = 1)
}
Files
Filesystem paths
- dirname
- basename
basename("/home/foobar/projects/my_r_project/main.R") # "main.R"
dirname("/home/foobar/projects/my_r_project/main.R") # "/home/foobar/projects/my_r_project"
Filesystem get current working directory (cwd)
- getwd
- cwd
getwd() # "/home/gabor/work/slides/r"
Concatenate paths
- file.path
file.path("usr", "local", "lib") # "usr/local/lib"
Change directory
- getwd
- setwd
getwd()
setwd("examples") # change directory
getwd()
setwd("/home/gabor/work/slides/r")
getwd()
Recursively list files and directories
- list.dirs
- list.files
list.dirs(".", recursive=FALSE) # recursive defaults to TRUE
list.files(".", recursive=TRUE) # recursive defaults to FALSE
Write to a textfile
- file
- cat
filename = "sample.txt"
print(filename)
fh <- file(filename, "w")
cat("TITLE line", "First row", "", "Third row", sep = "\n", file = fh)
cat("One more line\n", file = fh)
close(fh)
Read a textfile
filename = "README.md"
readLines(filename)
Counter with file storage
# Counter that survives between runs by keeping its value in a file.
filename = "count.txt"
# Start from 0 unless an earlier run left a value in the file.
count <- 0
if (file.exists(filename)) {
  count <- as.numeric(readLines(filename))
}
count <- count + 1
# cat() has no "end" parameter (an unknown named argument is just printed),
# so the newline is passed as an ordinary value.
cat(count, "\n")
# Store the new value for the next run.
fh <- file(filename, "w")
cat(count, "\n", file = fh)
close(fh)
Graphs
Bar plot
- barplot
distances = c(11, 15, 7, 23, 9)
barplot(distances)
barplot(c(2, 3, 7, 1))
data <- rnorm(100)
barplot(data)
barplot(c(2, 3, 7, 1))
data <- rnorm(100)
barplot(data)
Associate name with each value
fruits = c("Apple", "Banana", "Peach")
nums = c(3, 7, 5)
names(nums) = fruits
print(nums["Peach"])
#print(class(nums["Peach"]))
{% embed include file="src/examples/graphs/associate_name.out" %}
barplot this!
Quick graphs
- barplot
- plot
- hist
- boxplot
- pie
distances = c(11, 15, 7, 23, 9)
barplot(distances)
plot(distances)
hist(distances)
boxplot(distances)
pie(distances)
barplot(distances, col="purple")
barplot(distances, col="#2323AA")
barplot(distances, col="#2323AA", ylab="Text on Y", xlab="Text on X")
- Will create a file called Rplots.pdf.
Random numbers in normal distribution
-
rnorm
-
hist
-
Normal distribution
-
Histogram of the numbers
normal_numbers = rnorm(100)
hist(normal_numbers)
Packages
CRAN
- CRAN - The Comprehensive R Archive Network
Install CRAN packages
$ R
> install.packages("csv", "lib")
This will install the "csv" package in the "lib" folder. It will also install all the dependencies.
Install CRAN packages from the command line
Rscript -e 'install.packages("RUnit")'
Install several CRAN packages from the command line
Rscript -e 'install.packages(c("RUnit", "zoo"))'
Read CSV file
# Make the project-local "lib" folder available, then install and load the
# "csv" package from it.
.libPaths(append(.libPaths(), "lib"))
install.packages("csv", "lib")
library("csv")
filename = "a.csv"
# read.table() splits fields on whitespace by default, so a comma separated
# file ends up as a single column. read.csv() uses sep = "," and
# header = TRUE.
table = read.csv(filename)
Read JSON file
-
JSON
-
read_json
.libPaths(append(.libPaths(), "lib"))
install.packages("jsonlite", "lib")
library("jsonlite")
filename = "r.json"
data = read_json(filename)
Other
Exit - quit
- quit
{% embed include file="src/examples/other/exit.R" %}
Script name
{% embed include file="src/examples/other/script_name.R" %}
Command line arguments
{% embed include file="src/examples/other/command_line_arguments.R" %}
if-statement
- if
- else
{% embed include file="src/examples/other/if.R" %}
for loop
- for
- in
{% embed include file="src/examples/other/for.R" %}