## ----include = FALSE----------------------------------------------------------
## Chunk defaults for the rest of the document: collapse source and output
## into one block, prefix printed output with "#>", and do not evaluate chunks.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  eval = FALSE
)

## ----setup--------------------------------------------------------------------
## Attaches the crawlee package; presumably it provides crawler() and the
## cr_*() helpers used in the examples below (TODO confirm).
## The call is commented out, most likely because chunk evaluation is switched
## off (eval = FALSE) in the options chunk at the top of this file.
# library(crawlee)

## -----------------------------------------------------------------------------
## Minimal example: build a crawler for one start URL and register an HTML
## handler that calls ctx$push_data() with the request URL and the text of the
## page's <title> element (extracted with rvest). The pipeline then goes
## through cr_run() and cr_collect(), and the value is stored in `result`.
## NOTE(review): presumably cr_collect() returns the pushed records in tabular
## form -- confirm against the crawlee documentation.
# result <- crawler("https://books.toscrape.com/") |>
#   cr_on_html(function(ctx) {
#     ctx$push_data(list(
#       url   = ctx$request$url,
#       title = ctx$page |> rvest::html_element("title") |> rvest::html_text2()
#     ))
#   }) |>
#   cr_run() |>
#   cr_collect()
# 
# result

## -----------------------------------------------------------------------------
## Link-following example: the handler records each request URL and then calls
## ctx$enqueue_links() with no arguments (per the inline comment, this follows
## every same-domain link). cr_options(max_requests = 50) is applied before
## cr_run(); presumably it caps the total number of requests -- TODO confirm.
## The pipeline ends at cr_run() and its value is not assigned.
# crawler("https://books.toscrape.com/") |>
#   cr_on_html(function(ctx) {
#     ctx$push_data(list(url = ctx$request$url))
#     ctx$enqueue_links() # follow every same-domain link
#   }) |>
#   cr_options(max_requests = 50) |>
#   cr_run()

## -----------------------------------------------------------------------------
## Filtered example: cr_options() sets max_depth = 3 and max_requests = 200
## before the handler is registered. The handler records each request's URL
## and depth, then calls ctx$enqueue_links() with a `glob` pattern for
## catalogue pages and an `exclude` pattern for category pages.
## NOTE(review): presumably `exclude` takes precedence over `glob` when a link
## matches both -- confirm against the crawlee documentation.
## The collected value is not assigned to a variable here.
# crawler("https://books.toscrape.com/") |>
#   cr_options(max_depth = 3, max_requests = 200) |>
#   cr_on_html(function(ctx) {
#     ctx$push_data(list(url = ctx$request$url, depth = ctx$request$depth))
#     ctx$enqueue_links(
#       glob    = "*/catalogue/*", # only follow catalogue pages
#       exclude = "*/category/*"
#     )
#   }) |>
#   cr_run() |>
#   cr_collect()

## -----------------------------------------------------------------------------
## Labelled-handler example: two cr_on_html() handlers are registered.
##   * The unlabelled handler enqueues links matching "*/catalogue/*index.html"
##     with label = "book", and links matching "*/page-*.html" with no label.
##   * The handler registered with label = "book" records the <h1> text as
##     `title` and the text of the ".price_color" element as `price`.
## The collected value is stored in `books`.
## NOTE(review): the "*/catalogue/*index.html" glob may also match category
## index pages (".../catalogue/category/.../index.html"), which would then be
## handled as "book" pages. The depth-limited example above passes
## exclude = "*/category/*" for that reason; consider doing the same here --
## confirm against the crawlee glob semantics.
# books <- crawler("https://books.toscrape.com/") |>
#   # listing pages: enqueue book detail pages, labelled "book"
#   cr_on_html(function(ctx) {
#     ctx$enqueue_links(glob = "*/catalogue/*index.html", label = "book")
#     ctx$enqueue_links(glob = "*/page-*.html") # pagination, default handler
#   }) |>
#   # detail pages
#   cr_on_html(label = "book", function(ctx) {
#     ctx$push_data(list(
#       title = ctx$page |> rvest::html_element("h1") |> rvest::html_text2(),
#       price = ctx$page |> rvest::html_element(".price_color") |> rvest::html_text2()
#     ))
#   }) |>
#   cr_run() |>
#   cr_collect()
# 
# books

## -----------------------------------------------------------------------------
## Sitemap example: crawler() is created without a start URL and
## cr_from_sitemap() is given a sitemap URL with label = "book"; the handler
## registered for that label records each request URL.
## NOTE(review): presumably every URL listed in the sitemap is enqueued with
## the given label -- confirm. Also confirm that this sitemap URL actually
## exists on the demo site.
# crawler() |>
#   cr_from_sitemap("https://books.toscrape.com/sitemap.xml", label = "book") |>
#   cr_on_html(label = "book", function(ctx) {
#     ctx$push_data(list(url = ctx$request$url))
#   }) |>
#   cr_run() |>
#   cr_collect()

## -----------------------------------------------------------------------------
## Browser example: cr_use_browser(wait_selector = ".content") is applied
## before the handler is registered; the handler records the request URL and
## calls ctx$screenshot(), discarding its return value.
## NOTE(review): presumably pages are rendered in a browser and the handler
## runs once an element matching `wait_selector` is present -- confirm.
## Where the screenshot is written is not visible from this file.
# crawler("https://example.com") |>
#   cr_use_browser(wait_selector = ".content") |>
#   cr_on_html(function(ctx) {
#     ctx$push_data(list(url = ctx$request$url))
#     ctx$screenshot()
#   }) |>
#   cr_run()