merge conflict

Merge branch 'master' of github.com:decryptr/captcha # Conflicts: # data-raw/trt.R
decryptr · Mar 1, 2023 · fd7be8b · fd7be8b
2 parents 7503651 + 64e95c7
commit fd7be8b
Show file tree

Hide file tree

Showing 54 changed files with 815 additions and 171 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -8,3 +8,5 @@
 ^_pkgdown\.yml$
 ^docs$
 ^pkgdown$
+^doc$
+^Meta$
diff --git a/.gitignore b/.gitignore
@@ -8,3 +8,5 @@ data-raw/*
 !data-raw/trt.R
 docs
 inst/doc
+/doc/
+/Meta/
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -36,6 +36,7 @@ Suggests:
     covr,
     knitr,
     rmarkdown,
-    testthat (>= 3.0.0)
+    testthat (>= 3.0.0),
+    withr
 Config/testthat/edition: 3
 VignetteBuilder: knitr
diff --git a/NAMESPACE b/NAMESPACE
@@ -6,9 +6,9 @@ S3method(decrypt,default)
 S3method(length,captcha)
 S3method(plot,captcha)
 S3method(print,captcha)
-export(available_models)
 export(captcha_accuracy)
 export(captcha_annotate)
+export(captcha_available_models)
 export(captcha_dataset)
 export(captcha_fit_model)
 export(captcha_generate)

diff --git a/R/annotate.R b/R/annotate.R
@@ -1,23 +1,34 @@
 #' @title Annotate captchas with their labels
 #'
 #' @description Given one or more Captchas, this function
-#' prompts you to solve them mannually so that later you can train
-#' a model with those labels. Annotated captchas are saved at `path`
+#' prompts the user to solve them manually to train a model.
+#' Annotated captchas are saved at `path`
 #' with their labels in the filename separated by an underscore.
 #'
 #' @param files Either an object of class `captcha` or a character vector
 #'   with the paths to captcha files
 #' @param labels Either `NULL` (for interactive classification) or
-#'   a character vector with labels for the Captchas
+#'   a character vector with labels for the Captchas. See details.
 #' @param path Where to save the annotated captcha files.
 #'   If `NULL`, saves the files in the same folder as the unanswered counterparts.
 #' @param rm_old Whether or not to delete unanswered captchas after
-#' copying and renaming them
+#' copying and renaming them.
 #'
-#' @return A character vector with the paths to the newly created files
+#' @details
+#' The `labels=`
+#' parameter can handle situations where one knows the Captcha label.
+#' For example, a workflow that uses an oracle might provide the
+#' label automatically. When the label doesn't exist,
+#' the `captcha_annotate()` function opens the prompt for classification
+#' and shows the image using `plot()`.
+#'
+#' @return A vector with the paths of the modified files.
 #'
 #' @export
-captcha_annotate <- function(files, labels = NULL, path = NULL, rm_old = FALSE) {
+captcha_annotate <- function(files,
+                             labels = NULL,
+                             path = NULL,
+                             rm_old = FALSE) {
 
   if ("captcha" %in% class(files)) {
     files <- files$path

diff --git a/R/dataset.R b/R/dataset.R
@@ -1,7 +1,28 @@
 #' File to torch tensor
 #'
-#' @param x file path
-#' @param input_dim resize image to dimension
+#' This function uses the `torchvision` package to read and transform the
+#' image into a torch tensor. The function tries to adjust the dimensions to
+#' deal with black and white or coloured images.
+#'
+#' @param x character vector with the paths to image files.
+#' @param input_dim resize image to dimension. Defaults to 32x192, which is
+#' a good default for many Captcha applications.
+#'
+#' @return torch tensor with dimensions `length(x)`x`3`x`input_dim`.
+#'
+#' @examples
+#'
+#' if (!torch::torch_is_installed()) {
+#'   torch::install_torch()
+#' }
+#'
+#' captcha_file <- fs::dir_ls(
+#'   system.file("examples/captcha/", package = "captcha"
+#' ))
+#'
+#' result <- captcha_transform_image(captcha_file)
+#' class(result)
+#' dim(result)
 #'
 #' @export
 captcha_transform_image <- function(x, input_dim = c(32L, 192L)) {
@@ -23,9 +44,27 @@ adjust_dimensions <- function(img) {
 
 #' File to response matrix (tensor)
 #'
+#' This function performs a one-hot encoding of the label, transforming a label
+#' with `N` letters into a matrix of dimensions `N`x`length(vocab)`. All the
+#' labels must have the same length.
+#'
 #' @param all_letters list of tokens for all files
 #' @param vocab unique tokens
 #'
+#' @return torch tensor with dimensions `length(all_letters)`x`length(vocab)`
+#' containing only zeros and ones. Each row sums to exactly one.
+#'
+#' @examples
+#'
+#' if (!torch::torch_is_installed()) {
+#'   torch::install_torch()
+#' }
+#'
+#' vocab <- letters
+#' resp <- captcha_transform_label(c("a","b","c","d","e"), vocab)
+#' class(resp)
+#' dim(resp)
+#'
 #' @export
 captcha_transform_label <- function(all_letters, vocab) {
 
@@ -48,17 +87,53 @@ captcha_transform_label <- function(all_letters, vocab) {
     torch::torch_stack()
 }
 
-#' Captcha datasets
+#' Captcha dataset
+#'
+#' This object implements a dataset using the [torch::dataset()] framework.
+#' It loads all the images in torch tensors, as well as the labels.
 #'
 #' @param root (string): root directory where the files are stored
 #' @param transform_image (callable, optional): A function/transform
 #'   that takes in a file path and returns a torch tensor prepared
-#'   to feed the model.
+#'   to feed the model. By default, uses the [captcha_transform_image()]
+#'   function.
 #' @param transform_label (callable, optional): A function/transform
-#'   that takes in the file paths and transform them.
+#'   that takes in the file paths and transforms them. By default, uses the
+#'   [captcha_transform_label()] function.
 #' @param augmentation (function, optional) If not `NULL`, applies a
 #'   function to augment data with randomized preprocessing layers.
 #'
+#' This is an object of class `dataset_generator` created using
+#' [torch::dataset()] function. It has an `initialize()` method that
+#' takes a directory containing the input images,
+#' then assigns all the information in-memory with the array data
+#' structure for the response variable. It also has a `.getitem()` method that
+#' correctly extracts one observation of the dataset in this data
+#' structure, and a `.length()` method that correctly calculates the
+#' number of Captchas of the dataset.
+#'
+#' The function calculates the vocabulary based on the identified values in
+#' the dataset.
+#'
+#' @examples
+#'
+#' if (!torch::torch_is_installed()) {
+#'   torch::install_torch()
+#' }
+#'
+#' annotated_folder <- system.file(
+#'   "examples/annotated_captcha",
+#'   package = "captcha"
+#' )
+#'
+#' suppressMessages({
+#'   ds <- captcha_dataset(annotated_folder)
+#' })
+#'
+#' # gets the first item (the only item in the example)
+#' # returns a list with x and y torch tensors.
+#' ds$.getitem(1)
+#'
 #' @export
 captcha_dataset <- torch::dataset(
   name = "my_captcha",

diff --git a/R/decrypt.R b/R/decrypt.R
@@ -1,8 +1,34 @@
 #' Function to solve Captchas
 #'
-#' @param files files to read. Can be a character vector or an object of class `captcha`.
+#' Returns a label for an image using a fitted model. The image can be either a
+#' character vector (of length one or more) or an object of class `captcha`.
+#'
+#' @param files files to read. Can be either a character vector
+#'   or an object of class `captcha`.
 #' @param model model of class `luz_module_fitted`
 #'
+#' @return character vector of the predicted labels.
+#'
+#' @examples
+#'
+#' captcha_file <- system.file(
+#'   "examples/captcha/cadesp.jpg",
+#'   package = "captcha"
+#' )
+#'
+#' cap <- read_captcha(captcha_file)
+#'
+#' if (interactive()) {
+#'   plot(cap)
+#' }
+#'
+#' # the code below uses access to the internet. If you want to run locally,
+#' # download the model object from the releases site.
+#' if (interactive()) {
+#'   model <- captcha_load_model("cadesp")
+#'   decrypt(cap, model)
+#' }
+#'
 #' @name decrypt
 #' @export
 decrypt <- function(files, model) {

diff --git a/R/fit.R b/R/fit.R
@@ -1,17 +1,44 @@
 #' Fit Captcha model
 #'
+#' Provides a basic interface for fitting custom models from fully labeled
+#' data. Annotation can be done manually using the [captcha_annotate()]
+#' function presented earlier or with another method developed by the user.
+#' The model uses a convolutional neural network architecture, similar
+#' to the LeNet-5 model.
+#'
 #' @param dir directory where the classified images are
 #' @param dir_valid (optional) directory to validation files
-#' @param prop_valid proportion of total images considered to validation. Default 0.2.
+#' @param prop_valid proportion of total images considered to validation.
+#'   Defaults to 0.2.
 #' @param dropout dropout hyperparameter. Default 0.25.
-#' @param dense_units number of dense units to use after convolution steps. Default 200.
+#' @param dense_units number of dense units to use after convolution steps.
+#'   Defaults to 200.
 #' @param decay Weight decay applied each epoch.
 #' @param batch_size Minibatch size. Default 40.
 #' @param epochs Number of epochs to use. Default 100. The model uses early
 #'   stopping, so it is possible that the procedure ends before the total
 #'   number of epochs actually run.
 #'
-#' @return fitted model of class `luz_module_fitted`
+#' @return fitted model of class `luz_module_fitted`.
+#'
+#' The modeling step has some assumptions about the file names.
+#' Images must be in a folder and have the pattern
+#' `path/to/file/<id>_<lab>.<ext>`, where:
+#' * `<id>`: can be any name, preferably without accents or other
+#'   special characters, to avoid encoding issues. It usually contains a
+#'   name for the type and a hash to identify the image uniquely.
+#'   __Note__: When annotating a file, the id must be unique, as two
+#'   Captchas can have the same label.
+#' * `<lab>`: is the Captcha label. It is a string of characters between
+#'   `[a-zA-Z0-9]`, which can be case-sensitive if necessary.
+#'   All labels must have the same length.
+#' * `<ext>`: file extension. It can be `.png`, `.jpeg` or `.jpg`.
+#'   The operations also work for the `.svg` format, but it may have
+#'   problems due to the image's transparency.
+#'
+#' An important note is that the model stops fitting after 20 iterations
+#' without significant increment of accuracy (chosen as 1%); for more
+#' details, see `vignette("advanced")`.
 #'
 #' @export
 captcha_fit_model <- function(dir,

diff --git a/R/generate.R b/R/generate.R
@@ -1,6 +1,7 @@
-#' Generate captcha
+#' Generate R-Captcha
 #'
-#' Generates random captcha image
+#' Generates a custom captcha image using the `magick` package. We name this
+#' captcha as R-Captcha.
 #'
 #' @param write_disk write image to disk? Defaults to `FALSE`.
 #' @param path path to save images. Defaults to current directory.
@@ -17,8 +18,8 @@
 #' @param p_noise probability to add random noise to image. Defaults to 40%.
 #' @param p_lat probability to add LAT algorithm to image. Defaults to 0.
 #'
-#' @return list containing two elements: imagemagick object and captcha
-#'   value.
+#' @return object of class `captcha`, which is a list containing three elements:
+#' the `image-magick` object, the label and the path.
 #'
 #' @examples
 #'
-Original file line number
+Diff line change
@@ Expand Up / @@ -8,3 +8,5 @@ @@
     ^_pkgdown\.yml$
     ^docs$
     ^pkgdown$
+    ^doc$
+    ^Meta$