add support for using cookie file
Sunshine authored and snshn committed Jan 13, 2024
1 parent 20c56a5 commit 78c3795
Showing 14 changed files with 465 additions and 33 deletions.
README.md: 2 additions & 1 deletion
@@ -137,9 +137,10 @@ cat index.html | monolith -aIiFfcMv -b https://original.site/ - > result.html
- `-b`: Use custom `base URL`
- `-B`: Forbid retrieving assets from specified domain(s)
- `-c`: Exclude CSS
-- `-C`: Save document using custom `charset`
+- `-C`: Read cookies from `file`
- `-d`: Allow retrieving assets only from specified `domain(s)`
- `-e`: Ignore network errors
+- `-E`: Save document using custom `encoding`
- `-f`: Omit frames
- `-F`: Exclude web fonts
- `-i`: Remove images
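For reference, the new `-C` flag expects a Netscape-format cookie file: a recognized header comment on the first line, then one cookie per line as seven tab-separated fields (domain, subdomain flag, path, HTTPS-only flag, expiry as a Unix timestamp, name, value). The exact layout mirrors `parse_cookie_file_contents` in `src/cookies.rs` below. A minimal hypothetical sample (the domain and values are made up), which could be passed as `monolith -C cookies.txt -o result.html https://example.com/`:

```
# Netscape HTTP Cookie File
.example.com	TRUE	/	FALSE	1735689600	session_id	0123456789abcdef
example.com	FALSE	/account	TRUE	0	csrf_token	f00dbabe
```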
src/cookies.rs: 119 additions & 0 deletions (new file)
@@ -0,0 +1,119 @@
use std::time::{SystemTime, UNIX_EPOCH};
use url::Url;

pub struct Cookie {
    pub domain: String,
    pub include_subdomains: bool,
    pub path: String,
    pub https_only: bool,
    pub expires: u64,
    pub name: String,
    pub value: String,
}

#[derive(Debug)]
pub enum CookieFileContentsParseError {
    InvalidHeader,
}

impl Cookie {
    pub fn is_expired(&self) -> bool {
        if self.expires == 0 {
            return false; // Session cookie, never expires
        }

        let start = SystemTime::now();
        let since_the_epoch = start
            .duration_since(UNIX_EPOCH)
            .expect("Time went backwards");

        self.expires < since_the_epoch.as_secs()
    }

    pub fn matches_url(&self, url: &str) -> bool {
        match Url::parse(url) {
            Ok(url) => {
                // Check protocol scheme
                match url.scheme() {
                    "http" => {
                        if self.https_only {
                            return false;
                        }
                    }
                    "https" => {}
                    _ => {
                        // Should never match URLs of protocols other than HTTP(S)
                        return false;
                    }
                }

                // Check host
                if let Some(url_host) = url.host_str() {
                    if self.domain.starts_with('.') && self.include_subdomains {
                        // Match subdomains, as well as the bare domain without its leading dot
                        if !url_host.to_lowercase().ends_with(&self.domain)
                            && !url_host.eq_ignore_ascii_case(&self.domain[1..])
                        {
                            return false;
                        }
                    } else if !url_host.eq_ignore_ascii_case(&self.domain) {
                        return false;
                    }
                } else {
                    return false;
                }

                // Check path
                if !url.path().eq_ignore_ascii_case(&self.path)
                    && !url.path().starts_with(&self.path)
                {
                    return false;
                }
            }
            Err(_) => {
                return false;
            }
        }

        true
    }
}

pub fn parse_cookie_file_contents(
    cookie_file_contents: &str,
) -> Result<Vec<Cookie>, CookieFileContentsParseError> {
    let mut cookies: Vec<Cookie> = Vec::new();

    for (i, line) in cookie_file_contents.lines().enumerate() {
        if i == 0 {
            // The first line must be a recognized Netscape cookie file header
            if !line.eq("# HTTP Cookie File") && !line.eq("# Netscape HTTP Cookie File") {
                return Err(CookieFileContentsParseError::InvalidHeader);
            }
        } else {
            // Ignore comment lines
            if line.starts_with('#') {
                continue;
            }

            // Attempt to parse the seven tab-separated fields; skip malformed lines
            let mut fields = line.split('\t');
            if fields.clone().count() != 7 {
                continue;
            }
            cookies.push(Cookie {
                domain: fields.next().unwrap().to_string().to_lowercase(),
                include_subdomains: fields.next().unwrap().to_string() == "TRUE",
                path: fields.next().unwrap().to_string(),
                https_only: fields.next().unwrap().to_string() == "TRUE",
                expires: fields.next().unwrap().parse::<u64>().unwrap(),
                name: fields.next().unwrap().to_string(),
                value: fields.next().unwrap().to_string(),
            });
        }
    }

    Ok(cookies)
}
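Taken together, a hedged sketch of how this module is meant to be used (not code from the commit; the inline cookie line and URL are hypothetical):

```rust
use monolith::cookies::parse_cookie_file_contents;

fn main() {
    // A minimal in-memory cookie file; real contents would come from fs::read_to_string
    let contents =
        "# Netscape HTTP Cookie File\n.example.com\tTRUE\t/\tFALSE\t0\tsession_id\tabc123";

    let cookies = parse_cookie_file_contents(contents).expect("invalid cookie file");

    for cookie in &cookies {
        // A cookie applies only if it is unexpired and its scheme/host/path rules match
        if !cookie.is_expired() && cookie.matches_url("https://www.example.com/") {
            println!("{}={}", cookie.name, cookie.value);
        }
    }
}
```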
src/lib.rs: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
+pub mod cookies;
pub mod css;
pub mod html;
pub mod js;
src/main.rs: 32 additions & 7 deletions
@@ -10,6 +10,7 @@ use std::process
use std::time::Duration;
use url::Url;

+use monolith::cookies::parse_cookie_file_contents;
use monolith::html::{
add_favicon, create_metadata_tag, get_base_url, get_charset, has_favicon, html_to_dom,
serialize_document, set_base_url, set_charset, walk_and_embed_assets,
@@ -64,7 +65,7 @@ pub fn read_stdin() -> Vec<u8> {
}

fn main() {
-    let options = Options::from_args();
+    let mut options = Options::from_args();

// Check if target was provided
if options.target.len() == 0 {
@@ -74,10 +75,10 @@ fn main() {
process::exit(1);
}

-    // Check if custom charset is valid
-    if let Some(custom_charset) = options.charset.clone() {
-        if !Encoding::for_label_no_replacement(custom_charset.as_bytes()).is_some() {
-            eprintln!("Unknown encoding: {}", &custom_charset);
+    // Check if custom encoding is valid
+    if let Some(custom_encoding) = options.encoding.clone() {
+        if !Encoding::for_label_no_replacement(custom_encoding.as_bytes()).is_some() {
+            eprintln!("Unknown encoding: {}", &custom_encoding);
process::exit(1);
}
}
@@ -139,6 +140,30 @@ fn main() {
},
};

+    // Read and parse cookie file
+    if let Some(opt_cookie_file) = options.cookie_file.clone() {
+        match fs::read_to_string(opt_cookie_file) {
+            Ok(str) => match parse_cookie_file_contents(&str) {
+                Ok(cookies) => {
+                    options.cookies = cookies;
+                    // for c in &cookies {
+                    //     // if !cookie.is_expired() {
+                    //     //     options.cookies.append(c);
+                    //     // }
+                    // }
+                }
+                Err(_) => {
+                    eprintln!("Could not parse specified cookie file");
+                    process::exit(1);
+                }
+            },
+            Err(_) => {
+                eprintln!("Could not read specified cookie file");
+                process::exit(1);
+            }
+        }
+    }

// Initialize client
let mut cache = HashMap::new();
let mut header_map = HeaderMap::new();
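The commented-out loop above hints that expired cookies were meant to be filtered out at load time; the commit instead stores every parsed cookie and defers the `is_expired` check to retrieval time (see `src/utils.rs` below). A load-time filter for the `Ok(cookies)` arm could look like this sketch, assuming `cookies` and `options` as in the diff above:

```rust
// Hypothetical variant: drop expired cookies as soon as the file is parsed
options.cookies = cookies.into_iter().filter(|c| !c.is_expired()).collect();
```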
@@ -315,8 +340,8 @@
}

// Save using specified charset, if given
-    if let Some(custom_charset) = options.charset.clone() {
-        document_encoding = custom_charset;
+    if let Some(custom_encoding) = options.encoding.clone() {
+        document_encoding = custom_encoding;
dom = set_charset(dom, document_encoding.clone());
}

src/opts.rs: 28 additions & 20 deletions
@@ -1,15 +1,19 @@
use clap::{App, Arg, ArgAction};
use std::env;

+use crate::cookies::Cookie;
+
#[derive(Default)]
pub struct Options {
pub no_audio: bool,
pub base_url: Option<String>,
pub blacklist_domains: bool,
pub no_css: bool,
-    pub charset: Option<String>,
+    pub cookie_file: Option<String>,
+    pub cookies: Vec<Cookie>,
pub domains: Option<Vec<String>>,
pub ignore_errors: bool,
+    pub encoding: Option<String>,
pub no_frames: bool,
pub no_fonts: bool,
pub no_images: bool,
@@ -48,13 +52,13 @@ impl Options {
.version(env!("CARGO_PKG_VERSION"))
.author(format!("\n{}\n\n", env!("CARGO_PKG_AUTHORS").replace(':', "\n")).as_str())
.about(format!("{}\n{}", ASCII, env!("CARGO_PKG_DESCRIPTION")).as_str())
.args_from_usage("-a, --no-audio 'Removes audio sources'")
.args_from_usage("-b, --base-url=[http://localhost/] 'Sets custom base URL'")
.args_from_usage("-a, --no-audio 'Remove audio sources'")
.args_from_usage("-b, --base-url=[http://localhost/] 'Set custom base URL'")
.args_from_usage(
"-B, --blacklist-domains 'Treat list of specified domains as blacklist'",
)
.args_from_usage("-c, --no-css 'Removes CSS'")
.args_from_usage("-C, --charset=[UTF-8] 'Enforces custom encoding'")
.args_from_usage("-c, --no-css 'Remove CSS'")
.args_from_usage("-C, --cookies=[cookies.txt] 'Specify cookie file'")
.arg(
Arg::with_name("domains")
.short('d')
@@ -65,23 +69,24 @@
.help("Specify domains to use for white/black-listing"),
)
.args_from_usage("-e, --ignore-errors 'Ignore network errors'")
.args_from_usage("-f, --no-frames 'Removes frames and iframes'")
.args_from_usage("-F, --no-fonts 'Removes fonts'")
.args_from_usage("-i, --no-images 'Removes images'")
.args_from_usage("-I, --isolate 'Cuts off document from the Internet'")
.args_from_usage("-j, --no-js 'Removes JavaScript'")
.args_from_usage("-k, --insecure 'Allows invalid X.509 (TLS) certificates'")
.args_from_usage("-M, --no-metadata 'Excludes timestamp and source information'")
.args_from_usage("-E, --encoding=[UTF-8] 'Enforce custom charset'")
.args_from_usage("-f, --no-frames 'Remove frames and iframes'")
.args_from_usage("-F, --no-fonts 'Remove fonts'")
.args_from_usage("-i, --no-images 'Remove images'")
.args_from_usage("-I, --isolate 'Cut off document from the Internet'")
.args_from_usage("-j, --no-js 'Remove JavaScript'")
.args_from_usage("-k, --insecure 'Allow invalid X.509 (TLS) certificates'")
.args_from_usage("-M, --no-metadata 'Exclude timestamp and source information'")
.args_from_usage(
"-n, --unwrap-noscript 'Replaces NOSCRIPT elements with their contents'",
"-n, --unwrap-noscript 'Replace NOSCRIPT elements with their contents'",
)
.args_from_usage(
"-o, --output=[document.html] 'Writes output to <file>, use - for STDOUT'",
"-o, --output=[document.html] 'Write output to <file>, use - for STDOUT'",
)
.args_from_usage("-s, --silent 'Suppresses verbosity'")
.args_from_usage("-t, --timeout=[60] 'Adjusts network request timeout'")
.args_from_usage("-u, --user-agent=[Firefox] 'Sets custom User-Agent string'")
.args_from_usage("-v, --no-video 'Removes video sources'")
.args_from_usage("-s, --silent 'Suppress verbosity'")
.args_from_usage("-t, --timeout=[60] 'Adjust network request timeout'")
.args_from_usage("-u, --user-agent=[Firefox] 'Set custom User-Agent string'")
.args_from_usage("-v, --no-video 'Remove video sources'")
.arg(
Arg::with_name("target")
.required(true)
@@ -103,8 +108,11 @@
}
options.blacklist_domains = app.is_present("blacklist-domains");
options.no_css = app.is_present("no-css");
if let Some(charset) = app.value_of("charset") {
options.charset = Some(charset.to_string());
if let Some(cookie_file) = app.value_of("cookies") {
options.cookie_file = Some(cookie_file.to_string());
}
if let Some(encoding) = app.value_of("encoding") {
options.encoding = Some(encoding.to_string());
}
if let Some(domains) = app.get_many::<String>("domains") {
let list_of_domains: Vec<String> = domains.map(|v| v.clone()).collect::<Vec<_>>();
src/utils.rs: 12 additions & 2 deletions
@@ -1,5 +1,5 @@
use reqwest::blocking::Client;
-use reqwest::header::CONTENT_TYPE;
+use reqwest::header::{HeaderMap, HeaderValue, CONTENT_TYPE, COOKIE};
use std::collections::HashMap;
use std::fs;
use std::path::{Path, PathBuf};
@@ -304,7 +304,17 @@ pub fn retrieve_asset(
}

// URL not in cache, we retrieve the file
-    match client.get(url.as_str()).send() {
+    let mut headers = HeaderMap::new();
+    if options.cookies.len() > 0 {
+        for cookie in &options.cookies {
+            if !cookie.is_expired() && cookie.matches_url(url.as_str()) {
+                let cookie_header_value: String = cookie.name.clone() + "=" + &cookie.value;
+                headers
+                    .insert(COOKIE, HeaderValue::from_str(&cookie_header_value).unwrap());
+            }
+        }
+    }
+    match client.get(url.as_str()).headers(headers).send() {
Ok(response) => {
if !options.ignore_errors && response.status() != reqwest::StatusCode::OK {
if !options.silent {
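One caveat worth noting: `HeaderMap::insert` replaces any existing value for a key, so when several cookies match the same URL, only the last one survives in the `Cookie` header. A sketch of an alternative (not part of the commit; `cookie_headers_for` is a hypothetical helper) that joins all matching pairs into a single header value, the way RFC 6265 formats the `Cookie` header:

```rust
use monolith::cookies::Cookie;
use reqwest::header::{HeaderMap, HeaderValue, COOKIE};

// Join every cookie that applies to `url` into one "name1=value1; name2=value2" header
fn cookie_headers_for(cookies: &[Cookie], url: &str) -> HeaderMap {
    let mut headers = HeaderMap::new();
    let pairs: Vec<String> = cookies
        .iter()
        .filter(|c| !c.is_expired() && c.matches_url(url))
        .map(|c| format!("{}={}", c.name, c.value))
        .collect();
    if !pairs.is_empty() {
        headers.insert(COOKIE, HeaderValue::from_str(&pairs.join("; ")).unwrap());
    }
    headers
}
```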
tests/cli/unusual_encodings.rs: 2 additions & 2 deletions
@@ -115,7 +115,7 @@ mod passing {
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
let out = cmd
.arg("-M")
.arg("-C")
.arg("-E")
.arg("utf8")
.arg(format!(
"tests{s}_data_{s}unusual_encodings{s}gb2312.html",
@@ -158,7 +158,7 @@ mod passing {
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
let out = cmd
.arg("-M")
.arg("-C")
.arg("-E")
.arg("utf0")
.arg(format!(
"tests{s}_data_{s}unusual_encodings{s}gb2312.html",
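The diff above only repurposes the existing encoding tests for the renamed `-E` flag. A unit test for the new cookie parser (hypothetical; any actual tests among the remaining changed files are not shown on this page) could look like:

```rust
#[test]
fn parses_valid_netscape_cookie_file() {
    let contents = "# Netscape HTTP Cookie File\n.example.com\tTRUE\t/\tFALSE\t0\tid\tvalue";
    let cookies = monolith::cookies::parse_cookie_file_contents(contents).unwrap();

    assert_eq!(cookies.len(), 1);
    assert_eq!(cookies[0].domain, ".example.com");
    assert!(cookies[0].include_subdomains);
    assert!(!cookies[0].is_expired());
}
```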
(The remaining 7 changed files are not shown.)
