From 8efd1217ef885c4465e601381521e50acf3798fe Mon Sep 17 00:00:00 2001 From: Daniel Lidstrom Date: Fri, 14 Aug 2020 00:50:02 +0200 Subject: [PATCH 1/5] improved instructions for running and compiling --- CMakeLists.txt | 2 +- README.md | 108 +++++++++++++++++++++++++++++-------------------- compile.sh | 3 ++ 3 files changed, 68 insertions(+), 45 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 712cbb0..a44c3c7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.15.5) project(duplo) file(GLOB SOURCES src/*.cpp) -SET(DUPLO_VERSION "1.0.0" CACHE STRING "Duplo version") +SET(DUPLO_VERSION "\"v1.0.0\"" CACHE STRING "Duplo version") if(MSVC) else() diff --git a/README.md b/README.md index 3fb02e4..8709099 100644 --- a/README.md +++ b/README.md @@ -1,42 +1,44 @@ -# 1. Duplo (C/C++/Java Duplicate Source Code Block Finder) +# Duplo (C/C++/Java Duplicate Source Code Block Finder) ![C/C++ CI](https://github.com/dlidstrom/Duplo/workflows/C/C++%20CI/badge.svg) -- [1. Duplo (C/C++/Java Duplicate Source Code Block Finder)](#1-duplo-ccjava-duplicate-source-code-block-finder) - - [1.1. General Information](#11-general-information) - - [1.2. Maintainer](#12-maintainer) - - [1.3. File Format Support](#13-file-format-support) - - [1.4. Installation](#14-installation) - - [1.4.1. Docker](#141-docker) - - [1.4.2. Pre-built binaries](#142-pre-built-binaries) - - [1.5. Usage](#15-usage) - - [1.5.1. Passing files using `stdin`](#151-passing-files-using-stdin) - - [1.5.2. Passing files using file](#152-passing-files-using-file) - - [1.5.3. Xml output](#153-xml-output) - - [1.6. Feedback and Bug Reporting](#16-feedback-and-bug-reporting) - - [1.7. Algorithm Background](#17-algorithm-background) - - [1.7.1. Performance Measurements](#171-performance-measurements) - - [1.8. Developing](#18-developing) - - [1.8.1. Unix](#181-unix) - - [1.8.2. Windows](#182-windows) - - [1.8.3. 
Additional Language Support](#183-additional-language-support) - - [1.8.4. Language Suggestions](#184-language-suggestions) - - [1.9. Changes](#19-changes) - - [1.10. License](#110-license) - -## 1.1. General Information +- [1. General Information](#1-general-information) +- [2. Maintainer](#2-maintainer) +- [3. File Format Support](#3-file-format-support) +- [4. Installation](#4-installation) + - [4.1. Docker](#41-docker) + - [4.2. Pre-built binaries](#42-pre-built-binaries) +- [5. Usage](#5-usage) + - [5.1. Passing files using `stdin`](#51-passing-files-using-stdin) + - [5.1.1. Bash](#511-bash) + - [5.1.2. Windows](#512-windows) + - [5.1.3. Docker](#513-docker) + - [5.2. Passing files using file](#52-passing-files-using-file) + - [5.3. Xml output](#53-xml-output) +- [6. Feedback and Bug Reporting](#6-feedback-and-bug-reporting) +- [7. Algorithm Background](#7-algorithm-background) + - [7.1. Performance Measurements](#71-performance-measurements) +- [8. Developing](#8-developing) + - [8.1. Unix](#81-unix) + - [8.2. Windows](#82-windows) + - [8.3. Additional Language Support](#83-additional-language-support) + - [8.4. Language Suggestions](#84-language-suggestions) +- [9. Changes](#9-changes) +- [10. License](#10-license) + +## 1. General Information Duplicated source code blocks can harm maintainability of software systems. Duplo is a tool to find duplicated code blocks in large C, C++, Java, C# and VB.Net systems. -## 1.2. Maintainer +## 2. Maintainer Duplo was originally developed by Christian M. Ammann and is now maintained and developed by Daniel Lidström. -## 1.3. File Format Support +## 3. File Format Support Duplo has built in support for the following file formats: @@ -75,9 +77,9 @@ src\engine\geometry\SkinnedMeshGeometry.cpp(45) ... ``` -## 1.4. Installation +## 4. Installation -### 1.4.1. Docker +### 4.1. 
Docker If you have Docker, the way to run Duplo is to use this command: @@ -88,34 +90,52 @@ If you have Docker, the way to run Duplo is to use this command: This pulls the latest image and runs duplo. Note that you'll have to pipe the filenames into this command. A complete commandline sample will be shown below. -### 1.4.2. Pre-built binaries +### 4.2. Pre-built binaries Duplo is also available as a pre-built binary for (alpine) linux and macos. Grab the executable from the [releases](https://github.com/dlidstrom/Duplo/releases) page. You can of course build from source as well, and you'll have to do so to get a binary for Windows. -## 1.5. Usage +## 5. Usage Duplo works with a list of files. You can either specify a file that contains the list of files, or you can pass them using `stdin`. Run `duplo --help` on the command line to see the detailed options. -### 1.5.1. Passing files using `stdin` +### 5.1. Passing files using `stdin` + +In each of the following commands, `duplo` will write the duplicated blocks into `out.txt` in addition to the information written to stdout. + +#### 5.1.1. Bash ```bash # unix > find . -type f \( -iname "*.cpp" -o -iname "*.h" \) | duplo - out.txt +``` +Let's break this down. `find . -type f \( -iname "*.cpp" -o -iname "*.h" \)` is the syntax for looking recursively in the current directory (the `.` part) for files (the `-type f` part) matching `*.cpp` or `*.h` (case insensitive). The output from `find` is piped into `duplo` which then reads the filenames from `stdin` (the `-` tells `duplo` to get the filenames from `stdin`, a common unix convention in many commandline applications). The result of the analysis is then written to `out.txt`. + +#### 5.1.2. Windows + +```powershell # windows > Get-ChildItem -Include "*.cpp", "*.h" -Recurse | % { $_.FullName } | Duplo.exe - out.txt +``` + +This works similarly to the Bash command, but uses PowerShell commands to achieve the same effect. + +#### 5.1.3. Docker +```bash # Docker on unix > find . 
-type f \( -iname "*.cpp" -or -iname "*.h" \) | docker run --rm -i -w /src -v $(pwd):/src dlidstrom/duplo - out.txt ``` -In each of the above commands, `duplo` will write the duplicated blocks into `out.txt` in addition to the information written to stdout. +This command also works in a similar fashion to the Bash command, but instead of piping into a local `duplo` executable, it will pipe into `duplo` running inside Docker. This is very convenient as you do not have to install `duplo` separately. You will have to install Docker though, if you haven't already. That is a good thing to do anyway, since it opens up a lot of possibilities apart from running `duplo`. + +Again, similarly to the Bash command, this uses `find` to find files in the current directory, then passes the file list to Docker which will pass it further into an instance of the latest version of `duplo`. The working directory in the `duplo` container should be `/src` (that's where the `duplo` executable is located) and the current path of your host machine will be mapped to `/src` when the container is running. The `-i` allows `stdin` of your host machine to be passed into Docker to allow `duplo` to read the filenames. Any parameters to `duplo` can be placed at the end of the command as you can see `- out.txt` has been. -### 1.5.2. Passing files using file +### 5.2. Passing files using file `duplo` can analyze files specified in a separate file: @@ -135,30 +155,30 @@ In each of the above commands, `duplo` will write the duplicated blocks into `ou Again, the duplicated blocks are written to `out.txt`. -### 1.5.3. Xml output +### 5.3. Xml output Duplo can also output xml and there is a stylesheet that will format the result for viewing in a browser. This can be used as a report tab in your continuous integration tool (TeamCity, etc). -## 1.6. Feedback and Bug Reporting +## 6. Feedback and Bug Reporting Please open an issue to discuss feedback, feature requests and bug reports. -## 1.7. 
Algorithm Background +## 7. Algorithm Background Duplo uses the same techniques as Duploc to detect duplicated code blocks. See [Duca99bCodeDuplication](http://scg.unibe.ch/archive/papers/Duca99bCodeDuplication.pdf) for further information. -### 1.7.1. Performance Measurements +### 7.1. Performance Measurements | System | Files | Loc's | Time | |-|-|-|-| | Quake2 | 266 | 102740 | 18sec | -## 1.8. Developing +## 8. Developing -### 1.8.1. Unix +### 8.1. Unix You need `CMake` and preferrably `fswatch` for the best experience. @@ -183,11 +203,11 @@ build/> popd > ./watch.sh ``` -### 1.8.2. Windows +### 8.2. Windows Use Visual Studio 2019 to open the included solution file (or try `CMake`). -### 1.8.3. Additional Language Support +### 8.3. Additional Language Support Duplo can analyze all text files regardless of format, but it has special support for some programming languages (C++, C#, Java, for example). This allows Duplo to improve the duplication detection as it can ignore preprocessor directives and/or comments. @@ -196,7 +216,7 @@ To implement support for a new language, there are a couple of options (in order 1. Implement `FileTypeBase` which has support for handling comments and preprocessor directives. You just need to decide what is a comment. With this option you need to implement a couple of methods, one which is `CreateLineFilter`. This is to remove multiline comments. Look at `CstyleCommentsFilter` for an example. 2. Implement `IFileType` interface directly. This gives you the most freedom but also is the hardest option of course. -### 1.8.4. Language Suggestions +### 8.4. Language Suggestions - JavaScript (easy, just look at the existing C-based ones) - Ruby @@ -212,7 +232,7 @@ To implement support for a new language, there are a couple of options (in order Send me a pull request! -## 1.9. Changes +## 9. Changes - 0.5 - Fixed malformed xml (thanks [@ArsMasiuk](https://github.com/ArsMasiuk)!) @@ -228,7 +248,7 @@ Send me a pull request! 
- Fixed limitation of total number of lines of code - Checking of arbitrary files -## 1.10. License +## 10. License Duplo is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/compile.sh b/compile.sh index 585e496..a94a538 100755 --- a/compile.sh +++ b/compile.sh @@ -1,4 +1,7 @@ #!/bin/bash +# to run this, first set the DUPLO_VERSION environment variable. Otherwise +# some of the tests might fail. So, do this: +# > export DUPLO_VERSION=v1.0.0 p() { now="$(date +'%r')" printf "$(tput setaf 1)%s$(tput sgr0) | $(tput bold)$1$(tput sgr0)\n" "$now"; From 57a61a32a5e1d5729b3f7b121f33a4225dce6b46 Mon Sep 17 00:00:00 2001 From: Daniel Lidstrom Date: Fri, 14 Aug 2020 00:50:28 +0200 Subject: [PATCH 2/5] test case for showing original line numbers, in progress --- tests/Simple/LineNumbers.c | 12 ++++++++++++ tests/Simple/LineNumbers.lst | 1 + tests/Simple/tests.bats | 9 +++++++++ 3 files changed, 22 insertions(+) create mode 100644 tests/Simple/LineNumbers.c create mode 100644 tests/Simple/LineNumbers.lst create mode 100644 tests/Simple/tests.bats diff --git a/tests/Simple/LineNumbers.c b/tests/Simple/LineNumbers.c new file mode 100644 index 0000000..063f33d --- /dev/null +++ b/tests/Simple/LineNumbers.c @@ -0,0 +1,12 @@ +AAAAA +BBBBB +CCCCC +DDDDD +EEEEE +/* some comment to offset the line numbers */ +AAAAA +BBBBB +CCCCC +DDDDD +EEEEE +FFFFF diff --git a/tests/Simple/LineNumbers.lst b/tests/Simple/LineNumbers.lst new file mode 100644 index 0000000..cafb953 --- /dev/null +++ b/tests/Simple/LineNumbers.lst @@ -0,0 +1 @@ +tests/Simple/LineNumbers.c diff --git a/tests/Simple/tests.bats b/tests/Simple/tests.bats new file mode 100644 index 0000000..3ec9f98 --- /dev/null +++ b/tests/Simple/tests.bats @@ -0,0 +1,9 @@ +@test "LineNumbers.c" { + run ./build/duplo tests/Simple/LineNumbers.lst out.txt && cat out.txt + printf 'lines %s\n' "${lines[@]}" >&2 + echo "status = $status" + [ "$status" -eq 0 ] + 
[ "${lines[0]}" = "Loading and hashing files ... 2 done." ] + [ "${lines[1]}" = "tests/Simple/LineNumbers.c found: 1 block(s)" ] + [ "${lines[1]}" = "tests/Simple/LineNumbers.c(6)" ] +} From 2d14e462ad60f5d319e51b15d97fd64e7da6b165 Mon Sep 17 00:00:00 2001 From: Daniel Lidstrom Date: Sat, 15 Aug 2020 02:20:28 +0200 Subject: [PATCH 3/5] basic tests working - also made line number output 1-based --- .gitignore | 1 + src/Duplo.cpp | 4 ++-- src/FileTypeBase.cpp | 2 +- src/SourceLine.cpp | 16 ++++++++-------- tests/Simple/test-xml.bats | 29 +++++++++++++++++++++++++++++ tests/Simple/tests.bats | 32 ++++++++++++++++++++++++++++---- 6 files changed, 69 insertions(+), 15 deletions(-) create mode 100644 tests/Simple/test-xml.bats diff --git a/.gitignore b/.gitignore index 701ad9a..ec2649c 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,7 @@ duplo build .vscode out.txt +out.xml files.lst CMakeFiles CMakeCache.txt diff --git a/src/Duplo.cpp b/src/Duplo.cpp index e3bfc4f..9df1291 100644 --- a/src/Duplo.cpp +++ b/src/Duplo.cpp @@ -426,7 +426,7 @@ void Duplo::Run(const Options& options) { << std::endl; } else { outfile - << "Configuration: " + << "Configuration:" << std::endl << " Number of files: " << files @@ -444,7 +444,7 @@ void Duplo::Run(const Options& options) { << options.GetIgnoreSameFilename() << std::endl << std::endl - << "Results: " + << "Results:" << std::endl << " Lines of code: " << locsTotal diff --git a/src/FileTypeBase.cpp b/src/FileTypeBase.cpp index 81a00c6..717eea4 100644 --- a/src/FileTypeBase.cpp +++ b/src/FileTypeBase.cpp @@ -35,7 +35,7 @@ bool FileTypeBase::IsSourceLine(const std::string& line) const { std::vector FileTypeBase::GetCleanedSourceLines(const std::vector& lines) const { auto lineFilter = CreateLineFilter(); std::vector filteredLines; - for (std::vector::size_type i = 0; i < lines.size(); i++) { + for (auto i = 0; i < lines.size(); i++) { auto filteredLine = GetCleanLine(lineFilter->ProcessSourceLine(lines[i])); if 
(IsSourceLine(filteredLine)) { filteredLines.emplace_back(filteredLine, i); diff --git a/src/SourceLine.cpp b/src/SourceLine.cpp index 3845725..40f662f 100644 --- a/src/SourceLine.cpp +++ b/src/SourceLine.cpp @@ -1,8 +1,8 @@ #include "SourceLine.h" #include "HashUtil.h" #include "SourceFile.h" - -#include + +#include SourceLine::SourceLine(const std::string& line, int lineNumber) { m_line = line; @@ -11,16 +11,16 @@ SourceLine::SourceLine(const std::string& line, int lineNumber) { std::string cleanLine; // Remove all white space and noise (tabs etc) - std::copy_if( - std::begin(line), - std::end(line), - std::back_inserter(cleanLine), + std::copy_if( + std::begin(line), + std::end(line), + std::back_inserter(cleanLine), [](char c) { return c > ' '; }); m_hash = HashUtil::Hash(cleanLine.c_str(), cleanLine.size()); } int SourceLine::GetLineNumber() const { - return m_lineNumber; + return m_lineNumber + 1; } bool SourceLine::operator==(const SourceLine& other) const { @@ -30,7 +30,7 @@ bool SourceLine::operator==(const SourceLine& other) const { const std::string& SourceLine::GetLine() const { return m_line; } - + unsigned long SourceLine::GetHash() const { return m_hash; } diff --git a/tests/Simple/test-xml.bats b/tests/Simple/test-xml.bats new file mode 100644 index 0000000..413b26b --- /dev/null +++ b/tests/Simple/test-xml.bats @@ -0,0 +1,29 @@ +setup() { + run ./build/duplo -xml tests/Simple/LineNumbers.lst out.xml +} + +@test "LineNumbers.c" { + [ "$status" -eq 0 ] + [ "${lines[0]}" = "Loading and hashing files ... 2 done." 
] + [ "${lines[1]}" = "tests/Simple/LineNumbers.c found: 1 block(s)" ] +} + +@test "LineNumbers.c out.xml" { + run cat out.xml + printf 'Lines:\n' + printf 'lines %s\n' "${lines[@]}" >&2 + [ "${lines[0]}" = "" ] + [ "${lines[1]}" = "" ] + [ "${lines[2]}" = " " ] + [ "${lines[3]}" = " " ] + [ "${lines[4]}" = " " ] + [ "${lines[5]}" = " " ] + [ "${lines[6]}" = " " ] + [ "${lines[7]}" = " " ] + [ "${lines[8]}" = " " ] + [ "${lines[9]}" = " " ] + [ "${lines[10]}" = " " ] + [ "${lines[11]}" = " " ] + [ "${lines[12]}" = " " ] + [ "${lines[13]}" = "" ] +} diff --git a/tests/Simple/tests.bats b/tests/Simple/tests.bats index 3ec9f98..ee86657 100644 --- a/tests/Simple/tests.bats +++ b/tests/Simple/tests.bats @@ -1,9 +1,33 @@ +setup() { + run ./build/duplo tests/Simple/LineNumbers.lst out.txt +} + @test "LineNumbers.c" { - run ./build/duplo tests/Simple/LineNumbers.lst out.txt && cat out.txt - printf 'lines %s\n' "${lines[@]}" >&2 - echo "status = $status" [ "$status" -eq 0 ] [ "${lines[0]}" = "Loading and hashing files ... 2 done." 
] [ "${lines[1]}" = "tests/Simple/LineNumbers.c found: 1 block(s)" ] - [ "${lines[1]}" = "tests/Simple/LineNumbers.c(6)" ] +} + +@test "LineNumbers.c out.txt" { + run cat out.txt + printf 'Lines:\n' + printf 'lines %s\n' "${lines[@]}" >&2 + printf 'output %s\n' "${output[@]}" >&2 + [ "${lines[0]}" = "tests/Simple/LineNumbers.c(7)" ] + [ "${lines[1]}" = "tests/Simple/LineNumbers.c(1)" ] + [ "${lines[2]}" = "AAAAA" ] + [ "${lines[3]}" = "BBBBB" ] + [ "${lines[4]}" = "CCCCC" ] + [ "${lines[5]}" = "DDDDD" ] + [ "${lines[6]}" = "EEEEE" ] + [ "${lines[7]}" = "Configuration:" ] + [ "${lines[8]}" = " Number of files: 1" ] + [ "${lines[9]}" = " Minimal block size: 4" ] + [ "${lines[10]}" = " Minimal characters in line: 3" ] + [ "${lines[11]}" = " Ignore preprocessor directives: 0" ] + [ "${lines[12]}" = " Ignore same filenames: 0" ] + [ "${lines[13]}" = "Results:" ] + [ "${lines[14]}" = " Lines of code: 11" ] + [ "${lines[15]}" = " Duplicate lines of code: 5" ] + [ "${lines[16]}" = " Total 1 duplicate block(s) found." 
] } From 5266c03ea0d7c311a6f0e54b2eb07877c9d442fe Mon Sep 17 00:00:00 2001 From: Daniel Lidstrom Date: Sat, 15 Aug 2020 02:33:12 +0200 Subject: [PATCH 4/5] correct data type --- src/FileTypeBase.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/FileTypeBase.cpp b/src/FileTypeBase.cpp index 717eea4..81a00c6 100644 --- a/src/FileTypeBase.cpp +++ b/src/FileTypeBase.cpp @@ -35,7 +35,7 @@ bool FileTypeBase::IsSourceLine(const std::string& line) const { std::vector FileTypeBase::GetCleanedSourceLines(const std::vector& lines) const { auto lineFilter = CreateLineFilter(); std::vector filteredLines; - for (auto i = 0; i < lines.size(); i++) { + for (std::vector::size_type i = 0; i < lines.size(); i++) { auto filteredLine = GetCleanLine(lineFilter->ProcessSourceLine(lines[i])); if (IsSourceLine(filteredLine)) { filteredLines.emplace_back(filteredLine, i); From f72b1b16054a9bb4d222ae09bc1e2cde11aeb625 Mon Sep 17 00:00:00 2001 From: Daniel Lidstrom Date: Sat, 15 Aug 2020 02:39:36 +0200 Subject: [PATCH 5/5] output ending line number - this is the unfiltered line number (original) --- src/Duplo.cpp | 34 ++++++++++++++++++++++++++++------ tests/Simple/LineNumbers.c | 1 + tests/Simple/test-xml.bats | 4 ++-- watch.sh | 2 +- 4 files changed, 32 insertions(+), 9 deletions(-) diff --git a/src/Duplo.cpp b/src/Duplo.cpp index 9df1291..82d2cc7 100644 --- a/src/Duplo.cpp +++ b/src/Duplo.cpp @@ -172,10 +172,26 @@ namespace { std::ostream& outFile) { unsigned duplicateLines = 0; if (xml) { - outFile << " " << std::endl; - outFile << " " << std::endl; - outFile << " " << std::endl; - outFile << " " << std::endl; + outFile + << " " + << std::endl; + int startLineNumber1 = source1.GetLine(line1).GetLineNumber(); + int endLineNumber1 = source1.GetLine(line1 + count).GetLineNumber(); + outFile + << " " + << std::endl; + int startLineNumber2 = source2.GetLine(line2).GetLineNumber(); + int endLineNumber2 = source2.GetLine(line2 + count).GetLineNumber(); + outFile + << " " 
+ << std::endl; + outFile + << " " + << std::endl; for (int j = 0; j < count; j++) { // replace various characters/ strings so that it doesn't upset the XML parser std::string tmpstr = source1.GetLine(j + line1).GetLine(); @@ -199,8 +215,14 @@ namespace { outFile << " " << std::endl; outFile << " " << std::endl; } else { - outFile << source1.GetFilename() << "(" << source1.GetLine(line1).GetLineNumber() << ")" << std::endl; - outFile << source2.GetFilename() << "(" << source2.GetLine(line2).GetLineNumber() << ")" << std::endl; + outFile + << source1.GetFilename() + << "(" << source1.GetLine(line1).GetLineNumber() << ")" + << std::endl; + outFile + << source2.GetFilename() + << "(" << source2.GetLine(line2).GetLineNumber() << ")" + << std::endl; for (int j = 0; j < count; j++) { outFile << source1.GetLine(j + line1).GetLine() << std::endl; duplicateLines++; diff --git a/tests/Simple/LineNumbers.c b/tests/Simple/LineNumbers.c index 063f33d..e802829 100644 --- a/tests/Simple/LineNumbers.c +++ b/tests/Simple/LineNumbers.c @@ -7,6 +7,7 @@ EEEEE AAAAA BBBBB CCCCC +// skip this line DDDDD EEEEE FFFFF diff --git a/tests/Simple/test-xml.bats b/tests/Simple/test-xml.bats index 413b26b..f371011 100644 --- a/tests/Simple/test-xml.bats +++ b/tests/Simple/test-xml.bats @@ -15,8 +15,8 @@ setup() { [ "${lines[0]}" = "" ] [ "${lines[1]}" = "" ] [ "${lines[2]}" = " " ] - [ "${lines[3]}" = " " ] - [ "${lines[4]}" = " " ] + [ "${lines[3]}" = " " ] + [ "${lines[4]}" = " " ] [ "${lines[5]}" = " " ] [ "${lines[6]}" = " " ] [ "${lines[7]}" = " " ] diff --git a/watch.sh b/watch.sh index 33b14b5..b4e943a 100755 --- a/watch.sh +++ b/watch.sh @@ -11,7 +11,7 @@ fswatch \ --extended \ --latency 0.5 \ --exclude ".*" \ - --include "Makefile|(\.(h|cpp|bats)$)" . | + --include "Makefile|(\.(h|cpp|bats|c)$)" . | while read line; do p "new changes received: $line"