feat: add --output and --latest options
thunderpoot committed Feb 27, 2024
1 parent b933f04 commit c226a94
Showing 2 changed files with 39 additions and 6 deletions.
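
For a quick sense of the new behaviour, here are example invocations (the python3 scdx command path and the domain are illustrative; the flags match the argument definitions in the diffs below):

    # Query only the latest crawl and write results to a chosen file
    python3 scdx -d example.com --latest -o results.jsonl

    # --latest and --crawls are mutually exclusive, so this exits with a usage error
    python3 scdx -d example.com --latest -c CC-MAIN-2024-10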
python3/scdx (14 additions, 2 deletions)

@@ -17,10 +17,19 @@ def ctrlc(sig,frame):
 
 signal.signal(signal.SIGINT,ctrlc)
 
+# Argument parsing setup
 parser = argparse.ArgumentParser(description="Crawl data collection script.")
 parser.add_argument("-s", "--sleep", type=int, default=2, help="Sleep duration in seconds.")
 parser.add_argument("-d", "--domain", type=str, required=True, help="Domain to search for.")
-parser.add_argument("-c", "--crawls", nargs='*', help="Specify which crawl(s) to query. Default is all.")
+
+# Mutually exclusive group for --latest and --crawls
+group = parser.add_mutually_exclusive_group()
+group.add_argument("-l", "--latest", action="store_true", help="Only check the latest crawl.")
+group.add_argument("-c", "--crawls", nargs='*', help="Specify which crawl(s) to query. Default is all.")
+
+# Output filename argument
+parser.add_argument("-o", "--output", type=str, help="Specify the output filename.")
+
 args = parser.parse_args()
 
 zzz = args.sleep # sleep duration from command line
@@ -42,8 +51,11 @@ if args.crawls:
 else:
     filtered_crawls = crawls
 
+if args.latest:
+    filtered_crawls = [crawls[0]] # Select only the first entry which is the latest crawl
+
 # Generate a filename with the current date and time
-filename = datetime.now().strftime("%Y-%m-%d_%H-%M-%S_output.jsonl")
+filename = args.output if args.output else datetime.now().strftime("%Y-%m-%d_%H-%M-%S_output.jsonl")
 
 # Open the file in write mode
 with open(filename, 'w') as outfile:
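Both of the new Python behaviours can be exercised in isolation. A minimal sketch, standard library only (the argument definitions match the diff; the parse_args inputs and the timestamp are illustrative):

    import argparse
    from datetime import datetime

    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group()
    group.add_argument("-l", "--latest", action="store_true")
    group.add_argument("-c", "--crawls", nargs="*")
    parser.add_argument("-o", "--output", type=str)

    # --output overrides the timestamped default filename
    args = parser.parse_args(["--latest"])
    filename = args.output if args.output else datetime.now().strftime("%Y-%m-%d_%H-%M-%S_output.jsonl")
    print(filename)  # e.g. 2024-02-27_12-00-00_output.jsonl

    # Supplying both grouped flags makes argparse exit with a usage error:
    # "argument -c/--crawls: not allowed with argument -l/--latest"
    # parser.parse_args(["--latest", "--crawls", "CC-MAIN-2024-10"])

Note that filtered_crawls = [crawls[0]] relies on the crawl listing being ordered newest-first, as the inline comment acknowledges.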
rust/src/main.rs (25 additions, 4 deletions)

@@ -41,8 +41,28 @@ fn main() {
                 .multiple(true)
                 .takes_value(true),
         )
+        .arg(
+            Arg::with_name("latest")
+                .short("l")
+                .long("latest")
+                .help("Check only the latest crawl")
+                .takes_value(false)
+                .conflicts_with("crawls"),
+        )
+        .arg(
+            Arg::with_name("output")
+                .short("o")
+                .long("output")
+                .value_name("FILENAME")
+                .help("Specify the output filename")
+                .takes_value(true),
+        )
         .get_matches();
 
+    let is_latest = matches.is_present("latest");
+    let output_filename = matches.value_of("output").map(|s| s.to_string()).unwrap_or_else(|| {
+        Local::now().format("%Y-%m-%d_%H-%M-%S_output.jsonl").to_string()
+    });
     let sleep_duration = matches.value_of("sleep").unwrap().parse::<u64>().unwrap();
     let domain = matches.value_of("domain").unwrap();
     let crawls: Vec<&str> = matches.values_of("crawls").unwrap_or_default().collect();
@@ -53,14 +73,15 @@ fn main() {
 
     if response.status().is_success() {
         let crawls_data: Vec<Value> = response.json().unwrap();
-        let filtered_crawls: Vec<&Value> = if !crawls.is_empty() {
+        let filtered_crawls: Vec<&Value> = if is_latest {
+            crawls_data.iter().take(1).collect() // Assuming the first one is the latest
+        } else if !crawls.is_empty() {
             crawls_data.iter().filter(|crawl| crawls.contains(&crawl["id"].as_str().unwrap())).collect()
         } else {
             crawls_data.iter().collect()
         };
 
-        let filename = Local::now().format("%Y-%m-%d_%H-%M-%S_output.jsonl").to_string();
-        let mut file = File::create(&filename).unwrap();
+        let mut file = File::create(&output_filename).unwrap();
 
         let pb = ProgressBar::new(filtered_crawls.len() as u64);
         pb.set_style(ProgressStyle::default_bar()
@@ -103,7 +124,7 @@ fn main() {
             }
         }
         pb.finish_with_message("Data collection complete.");
-        println!("Results saved to {}.", filename);
+        println!("Results saved to {}.", output_filename);
     } else {
         println!("Failed to fetch collinfo.json");
     }
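The Rust implementation mirrors the argparse setup: conflicts_with("crawls") makes clap reject --latest combined with --crawls at parse time, and the unwrap_or_else closure only builds the timestamped default filename when --output is absent, keeping the two implementations in step.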
