---
title: "rvestTutorial"
author: "Kathleen Durant"
date: "October 15, 2017"
output: pdf_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

## R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see <http://rmarkdown.rstudio.com>.

When you click the **Knit** button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

```{r getdata}
library(rvest)
library(magrittr)
# xml_structure() lives in xml2. rvest >= 1.0 imports xml2 instead of
# attaching it, so it has to be loaded explicitly.
library(xml2)

# Download and parse the CRAN packages landing page.
rpack_html <- "http://cran.r-project.org/web/packages" %>%
  read_html()

rpack_html %>%
  class()

# Print the element tree of the parsed document.
rpack_html %>%
  xml_structure(indent = 1)
```

```{r}
# Dump all text contained in the page.
rpack_html %>%
  html_text() %>%
  cat()
```

```{r}
# Select the parent of the <a> (inside a <p>) whose href contains "views".
rpack_html %>%
  html_node(xpath = "//p/a[contains(@href, 'views')]/..")
```

```{r}
# All links on the page: take the first six href values with `[` via the
# magrittr `.` placeholder ...
rpack_html %>%
  html_nodes(xpath = "//a") %>%
  html_attr("href") %>%
  .[1:6]

# ... and the same selection using magrittr's extract(), an alias for `[`.
rpack_html %>%
  html_nodes(xpath = "//a") %>%
  html_attr("href") %>%
  extract(1:6)
```

```{r}
```

```{r}
library(rvest)
library(httr)

# NOTE(review): `text` is not used in the chunks visible here; presumably it
# is sample input for the form on the page below -- confirm before removing.
text <- "Quirky spud boys can jam after zapping five worthy Polysixes."

mainpage <- read_html("http://read-able.com")

# List the attributes of every <form> element on the page.
mainpage %>%
  html_nodes(xpath = "//form") %>%
  html_attrs()
```

```{r}
library(httr)

# HTTP operations
# GET: fetch an existing resource. The URL contains all the necessary
#   information the server needs to locate and return the resource.
# POST: create a new resource. POST requests usually carry a payload that
#   specifies the data for the new resource.
# PUT: update an existing resource. The payload may contain the updated data
#   for the resource.
# DELETE: delete an existing resource.
# Simple GET request: print the response, then inspect status, headers, body.
r <- GET("http://httpbin.org/get")
r
status_code(r)
#> [1] 200
headers(r)
str(content(r))

r <- GET("http://httpbin.org/get")
# Get an informative description:
http_status(r)
r$status_code # 200 is success

# Body as text. The encoding is given explicitly so httr does not have to
# guess (and does not emit its "No encoding supplied" message).
content(r, "text", encoding = "UTF-8")
content(r, "text", encoding = "ISO-8859-1")

# can extract cookies that persist
r <- GET("http://httpbin.org/cookies/set", query = list(a = 1))
cookies(r)

# passing arguments to a website to extract specific information
r <- GET(
  "http://httpbin.org/get",
  query = list(key1 = "value1", key2 = "value2")
)
# Inspect the echoed query arguments right away, before `r` is reassigned by
# the next request.
content(r)$args

# add custom header
r <- GET("http://httpbin.org/get", add_headers(Name = "Hadley"))
str(content(r)$headers)

# send a cookie along with the request
r <- GET("http://httpbin.org/cookies", set_cookies("MeWant" = "cookies"))
content(r)$cookies

# when posting you can include data in the body
r <- POST("http://httpbin.org/post", body = list(a = 1, b = 2, c = 3))

# you can set the encoding
url <- "http://httpbin.org/post"
body <- list(a = 1, b = 2, c = 3)

# Form encoded
r <- POST(url, body = body, encode = "form")
# Multipart encoded
r <- POST(url, body = body, encode = "multipart")
# JSON encoded
r <- POST(url, body = body, encode = "json")

# you can also upload files
# POST(url, body = upload_file("mypath.txt"))
# POST(url, body = list(x = upload_file("mypath.txt")))
```

#