Skip to content

Commit

Permalink
new: Make categorization more accessible
Browse files Browse the repository at this point in the history
  • Loading branch information
Rafiot committed Jan 15, 2025
1 parent df3833f commit e38b33a
Show file tree
Hide file tree
Showing 13 changed files with 240 additions and 213 deletions.
5 changes: 3 additions & 2 deletions bin/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,11 @@ def check_poetry_version() -> None:
version = poetry_version_str.split()[2]
version = version.strip(')')
version_details = tuple(int(i) for i in version.split('.'))
if version_details < (1, 3, 0):
print('Lookyloo requires poetry >= 1.3.0, please update.')
if version_details < (2, 0, 0):
print('Lookyloo requires poetry >= 2.0.0, please update.')
print('If you installed with "pip install --user poetry", run "pip install --user -U poetry"')
print('If you installed via the recommended method, use "poetry self update"')
print('If you installed via pipx, use "pipx autoupdate"')
print('More details: https://github.com/python-poetry/poetry#updating-poetry')
sys.exit()

Expand Down
30 changes: 28 additions & 2 deletions lookyloo/capturecache.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,10 @@
from redis import Redis

from .context import Context
from .helpers import get_captures_dir, is_locked, load_pickle_tree, get_pickle_path, remove_pickle_tree, get_indexing, mimetype_to_generic
from .helpers import (get_captures_dir, is_locked, load_pickle_tree, get_pickle_path,
remove_pickle_tree, get_indexing, mimetype_to_generic, CaptureSettings)
from .default import LookylooException, try_make_file, get_config
from .exceptions import MissingCaptureDirectory, NoValidHarFile, MissingUUID, TreeNeedsRebuild
from .exceptions import MissingCaptureDirectory, NoValidHarFile, MissingUUID, TreeNeedsRebuild, InvalidCaptureSetting
from .modules import Cloudflare


Expand Down Expand Up @@ -108,6 +109,31 @@ def tree(self) -> CrawledTree:
time.sleep(5)
return load_pickle_tree(self.capture_dir, self.capture_dir.stat().st_mtime, self.logger)

@property
def categories(self) -> set[str]:
    '''Categories attached to this capture.

    Reads the ``categories`` file of the capture directory (one category
    per line). Returns an empty set when the file does not exist. Blank
    lines are skipped, so a newline-terminated or hand-edited file never
    yields an empty-string category.
    '''
    categ_file = self.capture_dir / 'categories'
    if not categ_file.exists():
        return set()
    with categ_file.open() as f:
        return {stripped for line in f if (stripped := line.strip())}

@categories.setter
def categories(self, categories: set[str]) -> None:
    '''Overwrite the ``categories`` file with the given categories, one per line.'''
    categ_file = self.capture_dir / 'categories'
    with categ_file.open('w') as f:
        # Sorted so the file content is deterministic for a given set.
        f.write('\n'.join(sorted(categories)))

@property
def capture_settings(self) -> CaptureSettings | None:
    '''Settings used for this capture, loaded from ``capture_settings.json``.

    Returns None when the file is absent, or when building the settings
    raises InvalidCaptureSetting (a warning is logged in that case).
    '''
    settings_path = self.capture_dir / 'capture_settings.json'
    if not settings_path.exists():
        return None
    try:
        with settings_path.open() as settings_fd:
            raw_settings = json.load(settings_fd)
            return CaptureSettings(**raw_settings)
    except InvalidCaptureSetting as e:
        self.logger.warning(f'[In file!] Invalid capture settings for {self.uuid}: {e}')
    return None


def serialize_sets(obj: Any) -> Any:
if isinstance(obj, set):
Expand Down
47 changes: 24 additions & 23 deletions lookyloo/lookyloo.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,37 +314,38 @@ def get_capture_settings(self, capture_uuid: str, /) -> CaptureSettings | None:
cache = self.capture_cache(capture_uuid)
if not cache:
return None
cs_file = cache.capture_dir / 'capture_settings.json'
if cs_file.exists():
try:
with cs_file.open('r') as f:
return CaptureSettings(**json.load(f))
except CaptureSettingsError as e:
self.logger.warning(f'[In file!] Invalid capture settings for {capture_uuid}: {e}')
return None

return None
return cache.capture_settings

def categorize_capture(self, capture_uuid: str, /, categories: list[str], *, as_admin: bool=False) -> tuple[set[str], set[str]]:
    '''Add categories (MISP Taxonomy tags) to a capture.

    Only tags that resolve to a complete entry of the dark-web taxonomy are accepted.

    :param capture_uuid: UUID of the capture to categorize.
    :param categories: Machinetags to attach to the capture.
    :param as_admin: If True, the dark-web categories of the capture are replaced by
                     the valid ones passed in (categories that do not start with
                     'dark-web' are kept). Otherwise, the valid categories are only
                     added to the existing ones.
    :return: The accepted categories and the rejected ones.
             Both sets are empty when categorization is disabled.
    '''
    if not get_config('generic', 'enable_categorization'):
        return set(), set()

    # Make sure the category is mappable to the dark-web taxonomy
    valid_categories: set[str] = set()
    invalid_categories: set[str] = set()
    for category in categories:
        taxonomy, predicate, name = self.taxonomies.revert_machinetag(category)  # type: ignore[misc]
        # Every condition must be OR-ed: with "not name and taxonomy.name != ..."
        # the taxonomy check only applied to tags without a name, so complete
        # tags from any other taxonomy were accepted.
        if not taxonomy or not predicate or not name or taxonomy.name != 'dark-web':
            self.logger.warning(f'Invalid category: {category}')
            invalid_categories.add(category)
        else:
            valid_categories.add(category)

    capture = self._captures_index[capture_uuid]
    if as_admin:
        # Keep categories that aren't a part of the dark-web taxonomy, force the rest
        current_categories = {c for c in capture.categories if not c.startswith('dark-web')}
    else:
        # Only add categories.
        current_categories = capture.categories
    current_categories |= valid_categories
    capture.categories = current_categories

    get_indexing().reindex_categories_capture(capture_uuid)
    if get_config('generic', 'index_everything'):
        get_indexing(full=True).reindex_categories_capture(capture_uuid)
    return valid_categories, invalid_categories

def uncategorize_capture(self, capture_uuid: str, /, category: str) -> None:
'''Remove a category (MISP Taxonomy tag) from a capture.'''
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -107,5 +107,5 @@ types-pytz = "^2024.2.0.20241221"
types-psutil = "^6.1.0.20241221"

[build-system]
requires = ["poetry-core"]
requires = ["poetry-core>=2.0"]
build-backend = "poetry.core.masonry.api"
9 changes: 6 additions & 3 deletions tools/3rdparty.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@

d3js_version = '7.9.0'
jquery_version = "3.7.1"
datatables_version = "2.1.8"
datatables_version = "2.2.1"
datatables_rowgroup_version = "1.5.1"
datatables_buttons_version = "3.2.0"
datatables_select_version = "3.0.0"

if __name__ == '__main__':
dest_dir = get_homedir() / 'website' / 'web' / 'static'
Expand All @@ -21,12 +24,12 @@
f.write(jquery.content)
print(f'Downloaded jquery v{jquery_version}.')

datatables_js = requests.get(f'https://cdn.datatables.net/v/bs4/dt-{datatables_version}/datatables.min.js')
datatables_js = requests.get(f'https://cdn.datatables.net/v/bs5/dt-{datatables_version}/b-{datatables_buttons_version}/rg-{datatables_rowgroup_version}/sl-{datatables_select_version}/datatables.min.js')
with (dest_dir / 'datatables.min.js').open('wb') as f:
f.write(datatables_js.content)
print(f'Downloaded datatables js v{datatables_version}.')

datatables_css = requests.get(f'https://cdn.datatables.net/v/bs4/dt-{datatables_version}/datatables.min.css')
datatables_css = requests.get(f'https://cdn.datatables.net/v/bs5/dt-{datatables_version}/b-{datatables_buttons_version}/rg-{datatables_rowgroup_version}/sl-{datatables_select_version}/datatables.min.css')
with (dest_dir / 'datatables.min.css').open('wb') as f:
f.write(datatables_css.content)
print(f'Downloaded datatables_css v{datatables_version}.')
Expand Down
88 changes: 36 additions & 52 deletions website/web/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -662,59 +662,39 @@ def historical_lookups(tree_uuid: str) -> str | WerkzeugResponse | Response:
circl_pdns=data.get('circl_pdns'))


@app.route('/tree/<string:tree_uuid>/categories_capture', methods=['GET', 'POST'])
def categories_capture(tree_uuid: str) -> str | WerkzeugResponse | Response:
    '''View (GET) or update (POST) the categories of a capture.

    Authenticated users can always categorize. Anonymous users can only do
    so while the capture is less than 24h old. The rule is enforced for
    both methods, so a forged POST cannot bypass what the form hides.
    '''
    if not enable_categorization:
        return redirect(url_for('tree', tree_uuid=tree_uuid))

    as_admin = flask_login.current_user.is_authenticated
    cache = lookyloo.capture_cache(tree_uuid)

    # only allow categorizing as user if the capture is less than 24h old
    can_categorize = bool(as_admin
                          or (cache and cache.timestamp >= datetime.now().astimezone() - timedelta(days=1)))

    if request.method == 'GET':
        current_categories = cache.categories if cache else set()
        return render_template('categories_view.html', tree_uuid=tree_uuid,
                               current_categories=current_categories,
                               can_categorize=can_categorize,
                               taxonomy=get_taxonomies().get('dark-web'))

    # Got a POST
    if not can_categorize:
        # The GET view does not offer categorization in that case, refuse it here too.
        flash('Categorizing this capture is not allowed.', 'error')
        return redirect(url_for('tree', tree_uuid=tree_uuid))

    # If admin, we can remove categories, otherwise, we only add new ones.
    categories = request.form.getlist('categories')
    current, error = lookyloo.categorize_capture(tree_uuid, categories, as_admin=as_admin)
    if current:
        flash(f"Current categories {', '.join(current)}", 'success')
    if error:
        flash(f"Unable to add categories {', '.join(error)}", 'error')
    return redirect(url_for('tree', tree_uuid=tree_uuid))


@app.route('/tree/<string:tree_uuid>/stats', methods=['GET'])
Expand Down Expand Up @@ -1404,6 +1384,7 @@ def index_generic(show_hidden: bool=False, show_error: bool=True, category: str
titles = sorted(titles, key=lambda x: (x[2], x[3]), reverse=True)
return render_template('index.html', titles=titles, public_domain=lookyloo.public_domain,
show_hidden=show_hidden,
category=category,
show_project_page=get_config('generic', 'show_project_page'),
enable_takedown_form=get_config('generic', 'enable_takedown_form'),
version=pkg_version)
Expand All @@ -1416,7 +1397,7 @@ def get_index_params(request: Request) -> tuple[bool, str]:
show_error = True if (request.args.get('show_error') and request.args.get('show_error') == 'True') else False

if enable_categorization:
category = request.args['category'] if request.args.get('category') else ''
category = unquote_plus(request.args['category']) if request.args.get('category') else ''
return show_error, category


Expand Down Expand Up @@ -1484,7 +1465,10 @@ def ressources() -> str:

@app.route('/categories', methods=['GET'])
def categories() -> str:
    '''List every indexed category along with the number of captures it contains.'''
    # Resolve the indexing backend once instead of on every loop iteration.
    indexing = get_indexing(flask_login.current_user)
    with_count: list[tuple[str, int]] = [(c, indexing.get_captures_category_count(c))
                                         for c in indexing.categories]
    return render_template('categories.html', categories=with_count)


@app.route('/rebuild_all')
Expand Down
8 changes: 4 additions & 4 deletions website/web/sri.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
"cookie_received.png": "EqL5fRFwjjXkSp242nacVFy7N8f1QAGJv4OIVDKQkDJQvq2MphwUnfLZUQvN3NMayHS/VTGQbgdQVjcOSQ2blA==",
"css.png": "XDfV8fW5XRQlHT20rZn3d6LdIp2Dzk+mnZlicBv61iJGFMENLSM4SDgRcGb+x927AlI3lb6qv2C6tJAR2nDl5g==",
"d3.min.js": "vc58qvvBdrDR4etbxMdlTt4GBQk1qjvyORR2nrsPsFPyrs+/u5c3+1Ct6upOgdZoIl7eq6k3a1UPDSNAQi/32A==",
"datatables.min.css": "d/ZytV0xigGpkuu/9Ttkpgu8zBLsG68DMAblggFmI0zkatHyiW08nw6pTeansbr4UoFQIvER8vZdgg4Wfm/Umg==",
"datatables.min.js": "jAsEnfxV9lbpxRXSwAivjF+eQsmLxo8oORZSMn0k9DytwlSO+lkWia5Rz7mfaRE1GgDwVjVt8LCtjd+NacPZug==",
"datatables.min.css": "oxhZ36t7WT9+cFh7jnOFF8etbUuCTwkocos8JmvRUyWZhl+RmKPocP3S3d8NLTlE8LLkbuz6hFiHW2q69R4zig==",
"datatables.min.js": "ZB9Vb/B86EfceALjJVP0ypZREqi3rnuuddJlmucP4fBlRB/4bJzZSgvQJJ02IITcTiPNnrcj0sxqanrbusTabg==",
"down.jpg": "LHRHJ5yCaSjNcDfEoChGIfh7K5HrMYbaGn7EOlxgZ8GoLIwb0nFBkpoOMG9gMHA/pBX2skkXMukvKJC6P6FBGg==",
"down_left.jpg": "UwHkJaZGayY1LewuFM3bJHQCUPG1vYyrVeiGG5mCM9MD9FtAhdbD4hBY3JZNDWv93CXeEAbxL1kqEeHTKnyquQ==",
"download.png": "J8y1gDKURf3AhgYDuqCnfaVLKRG2MI6k37xSvR5pJBAZ3aNmA6dDw6+UGf65hLBN3eGksaBJUeroBW/LDlUTqQ==",
Expand All @@ -33,13 +33,13 @@
"loader.gif": "ZZKD5vLSKBWKeUpa2KI9qheUJ49iTI/UULmVU/AX28fBfH00K3lLc2v5pVJZ4qXG1BbB13LTXzRKKU35H2XfNg==",
"lookyloo.jpeg": "i6wBj8CsIM5YAQLEMQfhs3CNOSKkErF8AMqqM6ZygSwCyQgv9CU8xt94veMZhM/ufBWoz7kAXmR+yywmxsTxug==",
"redirect.png": "PAjzlPV97rEFvH55mG1ZC9wRl98be3yMeX/nENuFkJcds6/AXgSR2ig/QyPULgobSnNgiYieLVWY/oqsgeywrQ==",
"render_tables.js": "+yQvQ54MoxseE8r3oWvzTrWh2lqQlFwNKnWWxGUgqD8shK1uMr4JE4BYudHpH39NABdOUUZpiJRAzzXWWio3DQ==",
"render_tables.js": "oZpiPzYELOjcgKm49QB0xzSoYFWtKEIDMpUu3oYKZ7GGL3D1ARhba0t40yZTJkAM6gM6BhaQ5L58XkHXHavNzg==",
"secure.svg": "H8ni7t0d60nCJDVGuZpuxC+RBy/ipAjWT627D12HlZGg6LUmjSwPTQTUekm3UJupEP7TUkhXyq6WHc5gy7QBjg==",
"stats.css": "/kY943FwWBTne4IIyf7iBROSfbGd82TeBicEXqKkRwawMVRIvM/Pk5MRa7okUyGIxaDjFQGmV/U1vy+PhN6Jbw==",
"stats_graph.js": "S/sMNQK1UMMLD0xQeEa7sq3ce8o6oPxwxGlyKVtaHOODjair86dbBDm7cu6pa/elMRDJT1j09jEFjWp+5GbhTw==",
"tree.css": "jc7+RiJaZy7utfMu7iMWicpt0y0ZFiEQlB4c7MFNdlWcZf0czi3LgSQUFlDWt828Mx463V+JP1RalXuRjbGcEg==",
"tree.js": "5dHZ3npV2YHsPlng1OtxPCOcTjTx1/N0KjrwDoIp4+NS7JMTu/pgaQoDVgtISjZEm1Vb0mra+oQ4eY2arZfbyA==",
"tree_modals.js": "E3SbfY0PXAwAOjaTu+l8VJ1L84bM7VgMRt3xEwdsPmstipyAlFtJAUTBV1FoV6nXRp7vKqeHtKDvncgapGs1Uw==",
"tree_modals.js": "aXXLQLpuCjaDPeq1V3tXSWtoGLszaeEKn7S2hsJLbaEwTNMtwoZwYX5F8XriSHNIgDj23RZP4xoGk9q7Hvkz7A==",
"up.jpg": "d1ljZJ9f5JekyM6RLFFH2Ua44j6neiQBdUIXOenRTjGppQr3JaeglpQIH6BjPCJL177+TH52U3UIRNS5YAyKIg==",
"up_right.jpg": "OMmz+n+MxR34P8/fn5t4DkqKqdJRzQbXQ7fAi2lhkZIJGhVs2vIyY1f2hpYoBxDAX1OcYsSE2lqIR2vXNDGZsA==",
"video.png": "gJtmkfr8I1Kw43pYEKjg6CAjgmhl1vIBKBQ3ZkxCu3wvxQm+6kf93iLrrFiY2WuiXzxEn2Leu52GJzmVN5id0g==",
Expand Down
53 changes: 53 additions & 0 deletions website/web/static/render_tables.js
Original file line number Diff line number Diff line change
Expand Up @@ -238,4 +238,57 @@
],
});
}
if (document.getElementById('category_table')) {
let cat_table = new DataTable('#category_table', {
retrieve: true,
drawCallback: function (settings) { newTabClickListener() },
order: [[ 0, "desc" ]],
pageLength: 25,
lengthMenu: [25, 50, {label: 'All', value:-1} ],

rowGroup: {
dataSrc: [0],
},
columns: [{visible: false },
{ width: '60%', orderable: false },
{ width: '35%', orderable: false },
{ width: '5%', orderable: false, render: DataTable.render.select()}],
select: {
style: 'multi',
headerCheckbox: false,
},
layout: {
topStart: {
buttons: [
{
extend: 'selected',
text: 'Review categories',
action: function (e, dt, button, config) {
let counter = dt.rows( { selected: true } ).count()
let tags = dt.cells( dt.rows( { selected: true } ).nodes(), 2).data().toArray();
document.getElementById('categories_counter').innerText = counter;
let list = document.getElementById("categories_selected");
list.innerHTML = '';
tags.forEach((item) => {
let elt = document.createElement("div");
elt.className = "form-check";
elt.innerHTML = `<input class="form-check-input" type="checkbox" name="categories" value='${item}' checked hidden> <label class="form-check-label">${item}</label>`;
list.appendChild(elt);
});
document.getElementById('new_categories').style.display = 'block';
}
}
],
}
}
});

cat_table.rows('.selected').select();
cat_table.on('user-select', function (e, dt, type, cell, originalEvent) {
if (originalEvent.target.parentNode.classList.contains("unselectable") ||
originalEvent.target.parentNode.parentNode.classList.contains("unselectable")) {
e.preventDefault();
}
});
}
}));
Loading

0 comments on commit e38b33a

Please sign in to comment.