Skip to content

Commit 10eee92

Browse files
committed
feat(dwarfsck): add --list, --checksum, --verbose (fixes gh #192)
1 parent 3d3e7e9 commit 10eee92

File tree

3 files changed

+243
-4
lines changed

3 files changed

+243
-4
lines changed

doc/dwarfsck.md

+16
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ with a non-zero exit code.
2525
- `-q`, `--quiet`:
2626
Don't produce any output unless there is an error.
2727

28+
- `-v`, `--verbose`:
29+
Produce verbose output, where applicable.
30+
2831
- `-O`, `--image-offset=`*value*|`auto`:
2932
Specify the byte offset at which the filesystem is located in the image.
3033
Use `auto` to detect the offset automatically. This is also the default.
@@ -36,6 +39,19 @@ with a non-zero exit code.
3639
header is present, the program will exit with exit code 2 and emit a
3740
warning.
3841

42+
- `-l`, `--list`:
43+
List all entries in the file system image. The output format is similar to that of `tar -t`.
44+
With `--verbose`, also print details about each entry.
45+
46+
- `--checksum=`*name*:
47+
Produce a checksum using the specified algorithm for each regular file in
48+
the file system image. This can be used to easily verify the file system
49+
image against local files, e.g.:
50+
51+
```
52+
dwarfsck --checksum=sha512 /tmp/fs.dwarfs | sha512sum --check
53+
```
54+
3955
- `-n`, `--num-workers=`*value*:
4056
Number of worker threads used for integrity checking.
4157

src/dwarfsck_main.cpp

+141-4
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,25 @@
1919
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
2020
*/
2121

22+
#include <algorithm>
2223
#include <cstring>
2324
#include <iostream>
25+
#include <mutex>
2426
#include <string_view>
2527
#include <vector>
2628

2729
#include <boost/program_options.hpp>
2830

31+
#include <fmt/chrono.h>
32+
#include <fmt/format.h>
33+
2934
#include <folly/String.h>
35+
#include <folly/gen/String.h>
3036
#include <folly/json.h>
3137
#include <folly/portability/Unistd.h>
3238
#include <folly/system/HardwareConcurrency.h>
3339

40+
#include "dwarfs/checksum.h"
3441
#include "dwarfs/error.h"
3542
#include "dwarfs/file_access.h"
3643
#include "dwarfs/filesystem_v2.h"
@@ -41,24 +48,131 @@
4148
#include "dwarfs/os_access.h"
4249
#include "dwarfs/tool.h"
4350
#include "dwarfs/util.h"
51+
#include "dwarfs/worker_group.h"
4452
#include "dwarfs_tool_main.h"
4553

4654
namespace dwarfs {
4755

4856
namespace po = boost::program_options;
4957

58+
namespace {
59+
60+
// List every entry of the file system image on `iol.out`.
//
// Non-verbose mode prints one sanitized path per line and skips entries
// whose path is empty. Verbose mode prints, for every entry, the mode
// string, uid/gid, size, modification time and path (plus the link target
// for symbolic links), with the numeric columns padded to the widest value
// found in the image.
//
//   fs       file system to list
//   iol      I/O layer; the listing is written to its `out` stream
//   verbose  select the detailed, column-aligned format
void do_list_files(filesystem_v2& fs, iolayer const& iol, bool verbose) {
  if (!verbose) {
    // The plain listing uses neither attributes nor column widths, so the
    // sizing pass and the per-entry getattr() calls are skipped entirely.
    fs.walk([&](auto const& de) {
      auto name = de.unix_path();
      utf8_sanitize(name);

      if (!name.empty()) {
        iol.out << name << "\n";
      }
    });
    return;
  }

  // Number of characters needed to print the largest element of `vec`.
  // An empty list yields a width of 1 instead of dereferencing the end
  // iterator returned by std::max_element (undefined behaviour).
  auto max_width = [](auto const& vec) -> size_t {
    auto max = std::max_element(vec.begin(), vec.end());
    if (max == vec.end()) {
      return 1;
    }
    return std::to_string(*max).size();
  };

  auto const uid_width = max_width(fs.get_all_uids());
  auto const gid_width = max_width(fs.get_all_gids());

  // First pass: find the largest size so the size column can be aligned.
  file_stat::off_type max_inode_size{0};
  fs.walk([&](auto const& de) {
    file_stat st;
    fs.getattr(de.inode(), &st);
    max_inode_size = std::max(max_inode_size, st.size);
  });

  // The width accounts for the locale-specific grouping applied by `L`.
  auto const inode_size_width = fmt::format("{:L}", max_inode_size).size();

  // Second pass: emit one formatted line per entry.
  fs.walk([&](auto const& de) {
    auto iv = de.inode();
    file_stat st;
    fs.getattr(iv, &st);
    auto name = de.unix_path();
    utf8_sanitize(name);

    if (iv.is_symlink()) {
      auto target = fs.readlink(iv).value();
      utf8_sanitize(target);
      name += " -> " + target;
    }

    // Arguments 0-2 are the dynamic column widths, 3-8 the actual fields.
    iol.out << fmt::format(
        "{3} {4:{0}}/{5:{1}} {6:{2}L} {7:%Y-%m-%d %H:%M} {8}\n", uid_width,
        gid_width, inode_size_width, iv.mode_string(), iv.getuid(),
        iv.getgid(), st.size, fmt::localtime(st.mtime), name);
  });
}
101+
102+
void do_checksum(logger& lgr, filesystem_v2& fs, iolayer const& iol,
103+
std::string const& algo, size_t num_workers) {
104+
LOG_PROXY(debug_logger_policy, lgr);
105+
106+
worker_group wg{lgr, *iol.os, "checksum", num_workers};
107+
std::mutex mx;
108+
109+
fs.walk_data_order([&](auto const& de) {
110+
auto iv = de.inode();
111+
if (iv.is_regular_file()) {
112+
wg.add_job([&, de, iv] {
113+
file_stat st;
114+
115+
if (fs.getattr(de.inode(), &st) != 0) {
116+
LOG_ERROR << "failed to get attributes for inode " << iv.inode_num();
117+
return;
118+
}
119+
120+
auto ranges = fs.readv(iv.inode_num(), st.size);
121+
122+
if (!ranges) {
123+
LOG_ERROR << "failed to read inode " << iv.inode_num() << ": "
124+
<< std::strerror(-ranges.error());
125+
return;
126+
}
127+
128+
checksum cs(algo);
129+
130+
for (auto& fut : ranges.value()) {
131+
try {
132+
auto range = fut.get();
133+
cs.update(range.data(), range.size());
134+
} catch (std::exception const& e) {
135+
LOG_ERROR << "error reading data from inode " << iv.inode_num()
136+
<< ": " << e.what();
137+
return;
138+
}
139+
}
140+
141+
auto output = fmt::format("{} {}\n", cs.hexdigest(), de.unix_path());
142+
143+
{
144+
std::lock_guard lock(mx);
145+
iol.out << output;
146+
}
147+
});
148+
}
149+
});
150+
151+
wg.wait();
152+
}
153+
154+
} // namespace
155+
50156
int dwarfsck_main(int argc, sys_char** argv, iolayer const& iol) {
157+
using namespace folly::gen;
158+
51159
const size_t num_cpu = std::max(folly::hardware_concurrency(), 1u);
52160

53-
std::string input, export_metadata, image_offset;
161+
auto algo_list = checksum::available_algorithms();
162+
auto checksum_desc = "print checksums for all files (" +
163+
(from(algo_list) | unsplit(", ")) + ")";
164+
165+
std::string input, export_metadata, image_offset, checksum_algo;
54166
logger_options logopts;
55167
size_t num_workers;
56168
int detail;
57169
bool quiet{false};
170+
bool verbose{false};
58171
bool output_json{false};
59172
bool check_integrity{false};
60173
bool no_check{false};
61174
bool print_header{false};
175+
bool list_files{false};
62176

63177
// clang-format off
64178
po::options_description opts("Command line options");
@@ -72,12 +186,21 @@ int dwarfsck_main(int argc, sys_char** argv, iolayer const& iol) {
72186
("quiet,q",
73187
po::value<bool>(&quiet)->zero_tokens(),
74188
"don't print anything unless an error occurs")
189+
("verbose,v",
190+
po::value<bool>(&verbose)->zero_tokens(),
191+
"produce verbose output")
75192
("image-offset,O",
76193
po::value<std::string>(&image_offset)->default_value("auto"),
77194
"filesystem image offset in bytes")
78195
("print-header,H",
79196
po::value<bool>(&print_header)->zero_tokens(),
80197
"print filesystem header to stdout and exit")
198+
("list,l",
199+
po::value<bool>(&list_files)->zero_tokens(),
200+
"list all files and exit")
201+
("checksum",
202+
po::value<std::string>(&checksum_algo),
203+
checksum_desc.c_str())
81204
("num-workers,n",
82205
po::value<size_t>(&num_workers)->default_value(num_cpu),
83206
"number of reader worker threads")
@@ -138,10 +261,16 @@ int dwarfsck_main(int argc, sys_char** argv, iolayer const& iol) {
138261
return 1;
139262
}
140263

264+
if (vm.count("checksum") && !checksum::is_available(checksum_algo)) {
265+
LOG_WARN << "checksum algorithm not available: " << checksum_algo;
266+
return 1;
267+
}
268+
141269
if (print_header &&
142-
(output_json || !export_metadata.empty() || check_integrity)) {
270+
(output_json || !export_metadata.empty() || check_integrity ||
271+
list_files || !checksum_algo.empty())) {
143272
LOG_WARN << "--print-header is mutually exclusive with --json, "
144-
"--export-metadata and --check-integrity";
273+
"--export-metadata, --check-integrity, --list and --checksum";
145274
return 1;
146275
}
147276

@@ -191,14 +320,22 @@ int dwarfsck_main(int argc, sys_char** argv, iolayer const& iol) {
191320
: filesystem_check_level::CHECKSUM;
192321
auto errors = no_check ? 0 : fs.check(level, num_workers);
193322

194-
if (!quiet) {
323+
if (!quiet && !list_files && checksum_algo.empty()) {
195324
if (output_json) {
196325
iol.out << folly::toPrettyJson(fs.info_as_dynamic(detail)) << "\n";
197326
} else {
198327
fs.dump(iol.out, detail);
199328
}
200329
}
201330

331+
if (list_files) {
332+
do_list_files(fs, iol, verbose);
333+
}
334+
335+
if (!checksum_algo.empty()) {
336+
do_checksum(lgr, fs, iol, checksum_algo, num_workers);
337+
}
338+
202339
if (errors > 0) {
203340
return 1;
204341
}

test/tool_main_test.cpp

+86
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
#include <gmock/gmock.h>
3232
#include <gtest/gtest.h>
3333

34+
#include <fmt/chrono.h>
3435
#include <fmt/format.h>
3536

3637
#include <folly/FileUtil.h>
@@ -2054,6 +2055,91 @@ TEST(dwarfsck_test, export_metadata_close_error) {
20542055
::testing::HasSubstr("failed to close metadata output file"));
20552056
}
20562057

2058+
// Requesting an unknown checksum algorithm must fail and name the
// offending algorithm in the error output.
TEST(dwarfsck_test, checksum_algorithm_not_available) {
  auto tester = dwarfsck_tester::create_with_image();

  auto const exit_code = tester.run({"image.dwarfs", "--checksum=grmpf"});
  EXPECT_NE(0, exit_code) << tester.err();

  auto const expected_message =
      ::testing::HasSubstr("checksum algorithm not available: grmpf");
  EXPECT_THAT(tester.err(), expected_message);
}
2064+
2065+
TEST(dwarfsck_test, list_files) {
2066+
auto t = dwarfsck_tester::create_with_image();
2067+
EXPECT_EQ(0, t.run({"image.dwarfs", "--list"})) << t.err();
2068+
auto out = t.out();
2069+
2070+
std::set<std::string> files;
2071+
folly::splitTo<std::string>('\n', out, std::inserter(files, files.end()),
2072+
true);
2073+
2074+
std::set<std::string> const expected{
2075+
"test.pl", "somelink", "somedir", "foo.pl",
2076+
"bar.pl", "baz.pl", "ipsum.txt", "somedir/ipsum.py",
2077+
"somedir/bad", "somedir/empty", "empty",
2078+
};
2079+
2080+
EXPECT_EQ(expected, files);
2081+
}
2082+
2083+
TEST(dwarfsck_test, list_files_verbose) {
2084+
auto t = dwarfsck_tester::create_with_image();
2085+
EXPECT_EQ(0, t.run({"image.dwarfs", "--list", "--verbose"})) << t.err();
2086+
auto out = t.out();
2087+
2088+
auto num_lines = std::count(out.begin(), out.end(), '\n');
2089+
EXPECT_EQ(12, num_lines);
2090+
2091+
std::vector<std::string> expected_re{
2092+
fmt::format("drwxrwxrwx\\s+1000/100\\s+8\\s+{:%Y-%m-%d %H:%M}\\s*\n",
2093+
fmt::localtime(2)),
2094+
fmt::format(
2095+
"-rw-------\\s+1337/ 0\\s+{:L}\\s+{:%Y-%m-%d %H:%M}\\s+baz.pl\n",
2096+
23456, fmt::localtime(8002)),
2097+
fmt::format("lrwxrwxrwx\\s+1000/100\\s+16\\s+{:%Y-%m-%d "
2098+
"%H:%M}\\s+somelink -> somedir/ipsum.py\n",
2099+
fmt::localtime(2002)),
2100+
};
2101+
2102+
for (auto const& str : expected_re) {
2103+
std::regex re{str};
2104+
EXPECT_TRUE(std::regex_search(out, re)) << "[" << str << "]\n" << out;
2105+
}
2106+
}
2107+
2108+
TEST(dwarfsck_test, checksum_files) {
2109+
auto t = dwarfsck_tester::create_with_image();
2110+
EXPECT_EQ(0, t.run({"image.dwarfs", "--checksum=md5"})) << t.err();
2111+
auto out = t.out();
2112+
2113+
auto num_lines = std::count(out.begin(), out.end(), '\n');
2114+
EXPECT_EQ(8, num_lines);
2115+
2116+
std::map<std::string, std::string> actual;
2117+
std::vector<std::string_view> lines;
2118+
folly::split('\n', out, lines);
2119+
2120+
for (auto const& line : lines) {
2121+
if (line.empty()) {
2122+
continue;
2123+
}
2124+
std::string file, hash;
2125+
folly::split(" ", line, hash, file);
2126+
EXPECT_TRUE(actual.emplace(file, hash).second);
2127+
}
2128+
2129+
std::map<std::string, std::string> const expected{
2130+
{"empty", "d41d8cd98f00b204e9800998ecf8427e"},
2131+
{"somedir/empty", "d41d8cd98f00b204e9800998ecf8427e"},
2132+
{"test.pl", "d41d8cd98f00b204e9800998ecf8427e"},
2133+
{"baz.pl", "e2bd36391abfd15dcc83cbdfb60a6bc3"},
2134+
{"somedir/ipsum.py", "70fe813c36ed50ebd7f4991857683676"},
2135+
{"foo.pl", "e2bd36391abfd15dcc83cbdfb60a6bc3"},
2136+
{"bar.pl", "e2bd36391abfd15dcc83cbdfb60a6bc3"},
2137+
{"ipsum.txt", "0782b6a546cedd8be8fc86ac47dc6d96"},
2138+
};
2139+
2140+
EXPECT_EQ(expected, actual);
2141+
}
2142+
20572143
class mkdwarfs_sim_order_test : public testing::TestWithParam<char const*> {};
20582144

20592145
TEST(mkdwarfs_test, max_similarity_size) {

0 commit comments

Comments
 (0)