-
Notifications
You must be signed in to change notification settings - Fork 3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[#64] Copy paste detection in lists #102
base: master
Are you sure you want to change the base?
Changes from 1 commit
317e3a6
e16377b
681c6b1
dfda710
f60aece
94748c5
034bef5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,11 +18,13 @@ module Xrefcheck.Scanners.Markdown | |
import Universum | ||
|
||
import CMarkGFM (Node (..), NodeType (..), PosInfo (..), commonmarkToNode) | ||
import Control.Lens hiding ((^?)) | ||
import Control.Monad.Except (MonadError, throwError) | ||
import Data.Aeson.TH (deriveFromJSON) | ||
import Data.ByteString.Lazy qualified as BSL | ||
import Data.DList qualified as DList | ||
import Data.Default (def) | ||
import Data.List (isSubsequenceOf) | ||
import Data.Text qualified as T | ||
import Data.Text.Lazy qualified as LT | ||
import Fmt (Buildable (..), blockListF, nameF, (+|), (|+)) | ||
|
@@ -37,6 +39,8 @@ data MarkdownConfig = MarkdownConfig | |
|
||
deriveFromJSON aesonConfigOption ''MarkdownConfig | ||
|
||
makePrisms ''NodeType | ||
|
||
defGithubMdConfig :: MarkdownConfig | ||
defGithubMdConfig = MarkdownConfig | ||
{ mcFlavor = GitHub | ||
|
@@ -176,8 +180,34 @@ nodeExtractInfo input@(Node _ _ nSubs) = do | |
_ -> return mempty | ||
|
||
copyPaste :: Node -> m FileInfoDiff | ||
copyPaste (Node _ (LIST _) nodes) = do | ||
case items of | ||
top : rest | urlIsASubsequence top -> do | ||
let bad = filter (not . urlIsASubsequence) rest | ||
pure mempty { _fidCopyPastes = DList.fromList bad } | ||
_ -> do | ||
pure mempty | ||
where | ||
items = do | ||
(_, nodes', _) <- takeOnly _ITEM nodes | ||
(_, nodes'', _) <- takeOnly _PARAGRAPH nodes' | ||
take 1 $ do | ||
(_, texts, (url, _)) <- takeOnly _LINK nodes'' | ||
(pos, _, txt) <- take 1 $ takeOnly _TEXT texts | ||
return (CopyPaste url txt (toPosition pos)) | ||
|
||
copyPaste _ = pure mempty | ||
|
||
takeOnly prizm list = do | ||
Node pos hdr nodes <- list | ||
case hdr^?prizm of | ||
Just res -> return (pos, nodes, res) | ||
Nothing -> [] | ||
|
||
urlIsASubsequence :: CopyPaste -> Bool | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nitpick: AFAICS, here we check if link text is a subsequence of link URL, so if name is a URL subsequence, not URL is a subsequence of name |
||
urlIsASubsequence paste = | ||
T.unpack (cpAnchorText paste) `isSubsequenceOf` T.unpack (cpPlainText paste) | ||
|
||
merge :: (Monad m, Monoid b) => [a -> m b] -> a -> m b | ||
merge fs a = mconcat <$> traverse ($ a) fs | ||
|
||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -27,6 +27,7 @@ import Universum | |||||
import Control.Concurrent.Async (wait, withAsync) | ||||||
import Control.Exception (throwIO) | ||||||
import Control.Monad.Except (MonadError (..)) | ||||||
import Data.Bits (toIntegralSized) | ||||||
import Data.ByteString qualified as BS | ||||||
import Data.Map qualified as M | ||||||
import Data.Text qualified as T | ||||||
|
@@ -49,7 +50,6 @@ import Text.Regex.TDFA.Text (Regex, regexec) | |||||
import Text.URI (Authority (..), URI (..), mkURI) | ||||||
import Time (RatioNat, Second, Time (..), ms, threadDelay, timeout) | ||||||
|
||||||
import Data.Bits (toIntegralSized) | ||||||
import Xrefcheck.Config | ||||||
import Xrefcheck.Core | ||||||
import Xrefcheck.Orphans () | ||||||
|
@@ -70,9 +70,7 @@ deriving newtype instance Semigroup (VerifyResult e) | |||||
deriving newtype instance Monoid (VerifyResult e) | ||||||
|
||||||
instance Buildable e => Buildable (VerifyResult e) where | ||||||
build vr = case verifyErrors vr of | ||||||
Nothing -> "ok" | ||||||
Just errs -> listF errs | ||||||
build vr = maybe "ok" listF (verifyErrors vr) | ||||||
|
||||||
verifyOk :: VerifyResult e -> Bool | ||||||
verifyOk (VerifyResult errors) = null errors | ||||||
|
@@ -114,6 +112,7 @@ data VerifyError | |||||
| ExternalFtpException FTPException | ||||||
| FtpEntryDoesNotExist FilePath | ||||||
| ExternalResourceSomeError Text | ||||||
| PossiblyIncorrectCopyPaste Text Text | ||||||
deriving stock (Show, Eq) | ||||||
|
||||||
instance Buildable VerifyError where | ||||||
|
@@ -156,10 +155,15 @@ instance Buildable VerifyError where | |||||
"⛂ FTP exception (" +| err |+ ")\n" | ||||||
|
||||||
FtpEntryDoesNotExist entry -> | ||||||
"⛂ File or directory does not exist:\n" +| entry |+ "\n" | ||||||
"⛂ File or directory does not exist:\n" +| entry |+ "\n" | ||||||
|
||||||
ExternalResourceSomeError err -> | ||||||
"⛂ " +| build err |+ "\n\n" | ||||||
|
||||||
PossiblyIncorrectCopyPaste url text -> | ||||||
"⛂ Possibly incorrect copy-paste in list with references\n" +| | ||||||
" the url is " +| build url |+ "\n " +| | ||||||
" but the text is " +| build text |+ "\n\n" | ||||||
where | ||||||
anchorHints = \case | ||||||
[] -> "\n" | ||||||
|
@@ -219,10 +223,19 @@ verifyRepo | |||||
|
||||||
progressRef <- newIORef $ initVerifyProgress (map snd toScan) | ||||||
|
||||||
errorss <- for (M.toList repoInfo) $ \(file, info) -> do | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here and below
Suggested change
|
||||||
let pasta = _fiCopyPastes info | ||||||
return | ||||||
$ VerifyResult | ||||||
$ fmap (\(CopyPaste url txt pos) -> | ||||||
WithReferenceLoc file (Reference "" "" Nothing pos) | ||||||
$ PossiblyIncorrectCopyPaste url txt) | ||||||
pasta | ||||||
|
||||||
accumulated <- withAsync (printer progressRef) $ \_ -> | ||||||
forConcurrentlyCaching toScan ifExternalThenCache $ \(file, ref) -> | ||||||
verifyReference config mode progressRef repoInfo' root file ref | ||||||
return $ fold accumulated | ||||||
return $ fold errorss <> fold accumulated | ||||||
where | ||||||
printer progressRef = forever $ do | ||||||
readIORef progressRef >>= reprintAnalyseProgress rw mode | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
{- SPDX-FileCopyrightText: 2019 Serokell <https://serokell.io> | ||
- | ||
- SPDX-License-Identifier: MPL-2.0 | ||
-} | ||
|
||
module Test.Xrefcheck.CopyPasteInListsSpec where | ||
|
||
import Universum | ||
|
||
import Test.Hspec (Spec, describe, it, shouldBe) | ||
|
||
import Test.Xrefcheck.Util | ||
import Xrefcheck.Core | ||
|
||
spec :: Spec | ||
spec = do | ||
describe "Possibly incorrect copy-paste" $ do | ||
for_ allFlavors $ \fl -> do | ||
it ("is detected (" <> show fl <> ")") $ do | ||
fi <- getFI fl "tests/markdowns/without-annotations/copy-paste_in_lists.md" | ||
getPasta fi `shouldBe`[("a", "c")] | ||
where | ||
getPasta :: FileInfo -> [(Text, Text)] | ||
getPasta fi = map (cpAnchorText &&& cpPlainText) $ fi ^. fiCopyPastes |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
<!-- | ||
- SPDX-FileCopyrightText: 2018-2019 Serokell <https://serokell.io> | ||
- | ||
- SPDX-License-Identifier: MPL-2.0 | ||
--> | ||
|
||
A list with bad copy-paste: | ||
|
||
- [a](a) e | ||
- [b](b) e | ||
- [c](a) e | ||
|
||
A list that is completely fine: | ||
|
||
- [a](a) d | ||
- [b](b) d | ||
- [c](c) d |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
According to the acceptance criteria, we want to report copy-paste if
there are two links [T1](L1) and [T2](L1) in a file, and T1 is substring of L1 modulo casing and all the non-alphanum characters, while T2 is not substring of L1 modulo the same things;
However, AFAIU here you check that the first item satisfy the law --
T1
is a subsequence ofL1
-- and report all other links from the list that doesn't satisfy as invalid, so[Tn](Ln)
would also be reported ifTn
isn't a subsequence ofLn
, e.g.:produces
I am not sure if we need to consider such list items as bad copy-paste, I believe it would be better to don't take them into account at all and report only those which strictly satisfy the law from acceptance criteria --
there are two links [T1](L1) and [T2](L1) in a file, and T1 is substring of L1 modulo casing and all the non-alphanum characters, while T2 is not substring of L1 modulo the same things;