Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[#64] Copy paste detection in lists #102

Draft
wants to merge 7 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/Xrefcheck/Core.hs
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ data FileInfo = FileInfo
}
deriving stock (Show, Generic)
deriving anyclass NFData

makeLenses ''FileInfo

instance Default FileInfo where
Expand Down
30 changes: 30 additions & 0 deletions src/Xrefcheck/Scanners/Markdown.hs
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,13 @@ module Xrefcheck.Scanners.Markdown
import Universum

import CMarkGFM (Node (..), NodeType (..), PosInfo (..), commonmarkToNode)
import Control.Lens hiding ((^?))
import Control.Monad.Except (MonadError, throwError)
import Data.Aeson.TH (deriveFromJSON)
import Data.ByteString.Lazy qualified as BSL
import Data.DList qualified as DList
import Data.Default (def)
import Data.List (isSubsequenceOf)
import Data.Text qualified as T
import Data.Text.Lazy qualified as LT
import Fmt (Buildable (..), blockListF, nameF, (+|), (|+))
Expand All @@ -37,6 +39,8 @@ data MarkdownConfig = MarkdownConfig

deriveFromJSON aesonConfigOption ''MarkdownConfig

makePrisms ''NodeType

defGithubMdConfig :: MarkdownConfig
defGithubMdConfig = MarkdownConfig
{ mcFlavor = GitHub
Expand Down Expand Up @@ -176,8 +180,34 @@ nodeExtractInfo input@(Node _ _ nSubs) = do
_ -> return mempty

copyPaste :: Node -> m FileInfoDiff
copyPaste (Node _ (LIST _) nodes) = do
case items of
top : rest | urlIsASubsequence top -> do
let bad = filter (not . urlIsASubsequence) rest
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

According to the acceptance criteria, we want to report copy-paste if there are two links [T1](L1) and [T2](L1) in a file, and T1 is substring of L1 modulo casing and all the non-alphanum characters, while T2 is not substring of L1 modulo the same things;

However, AFAIU here you check that the first item satisfy the law -- T1 is a subsequence of L1 -- and report all other links from the list that doesn't satisfy as invalid, so [Tn](Ln) would also be reported if Tn isn't a subsequence of Ln, e.g.:

- [Foo Bar](foo-bar) e
- [Foo Qux](foo-qux) e
- [Foo Kek](foo-kek) e
- [Just text](file) e

produces

  ➥  In file tests/markdowns/without-annotations/copy/copy-paste.md
     bad reference (local) at src:18:4-12:
       - text: ""
       - link:
       - anchor: -

     ⛂  Possibly incorrect copy-paste in list with references
        the url is file
           but the text is Just text

I am not sure if we need to consider such list items as bad copy-paste, I believe it would be better to don't take them into account at all and report only those which strictly satisfy the law from acceptance criteria -- there are two links [T1](L1) and [T2](L1) in a file, and T1 is substring of L1 modulo casing and all the non-alphanum characters, while T2 is not substring of L1 modulo the same things;

pure mempty { _fidCopyPastes = DList.fromList bad }
_ -> do
pure mempty
where
items = do
(_, nodes', _) <- takeOnly _ITEM nodes
(_, nodes'', _) <- takeOnly _PARAGRAPH nodes'
take 1 $ do
(_, texts, (url, _)) <- takeOnly _LINK nodes''
(pos, _, txt) <- take 1 $ takeOnly _TEXT texts
return (CopyPaste url txt (toPosition pos))

copyPaste _ = pure mempty

takeOnly prizm list = do
Node pos hdr nodes <- list
case hdr^?prizm of
Just res -> return (pos, nodes, res)
Nothing -> []

urlIsASubsequence :: CopyPaste -> Bool
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nitpick: AFAICS, here we check if link text is a subsequence of link URL, so if name is a URL subsequence, not URL is a subsequence of name

urlIsASubsequence paste =
T.unpack (cpAnchorText paste) `isSubsequenceOf` T.unpack (cpPlainText paste)

merge :: (Monad m, Monoid b) => [a -> m b] -> a -> m b
merge fs a = mconcat <$> traverse ($ a) fs

Expand Down
25 changes: 19 additions & 6 deletions src/Xrefcheck/Verify.hs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import Universum
import Control.Concurrent.Async (wait, withAsync)
import Control.Exception (throwIO)
import Control.Monad.Except (MonadError (..))
import Data.Bits (toIntegralSized)
import Data.ByteString qualified as BS
import Data.Map qualified as M
import Data.Text qualified as T
Expand All @@ -49,7 +50,6 @@ import Text.Regex.TDFA.Text (Regex, regexec)
import Text.URI (Authority (..), URI (..), mkURI)
import Time (RatioNat, Second, Time (..), ms, threadDelay, timeout)

import Data.Bits (toIntegralSized)
import Xrefcheck.Config
import Xrefcheck.Core
import Xrefcheck.Orphans ()
Expand All @@ -70,9 +70,7 @@ deriving newtype instance Semigroup (VerifyResult e)
deriving newtype instance Monoid (VerifyResult e)

instance Buildable e => Buildable (VerifyResult e) where
build vr = case verifyErrors vr of
Nothing -> "ok"
Just errs -> listF errs
build vr = maybe "ok" listF (verifyErrors vr)

verifyOk :: VerifyResult e -> Bool
verifyOk (VerifyResult errors) = null errors
Expand Down Expand Up @@ -114,6 +112,7 @@ data VerifyError
| ExternalFtpException FTPException
| FtpEntryDoesNotExist FilePath
| ExternalResourceSomeError Text
| PossiblyIncorrectCopyPaste Text Text
deriving stock (Show, Eq)

instance Buildable VerifyError where
Expand Down Expand Up @@ -156,10 +155,15 @@ instance Buildable VerifyError where
"⛂ FTP exception (" +| err |+ ")\n"

FtpEntryDoesNotExist entry ->
"⛂ File or directory does not exist:\n" +| entry |+ "\n"
"⛂ File or directory does not exist:\n" +| entry |+ "\n"

ExternalResourceSomeError err ->
"⛂ " +| build err |+ "\n\n"

PossiblyIncorrectCopyPaste url text ->
"⛂ Possibly incorrect copy-paste in list with references\n" +|
" the url is " +| build url |+ "\n " +|
" but the text is " +| build text |+ "\n\n"
where
anchorHints = \case
[] -> "\n"
Expand Down Expand Up @@ -219,10 +223,19 @@ verifyRepo

progressRef <- newIORef $ initVerifyProgress (map snd toScan)

errorss <- for (M.toList repoInfo) $ \(file, info) -> do
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here and below

Suggested change
errorss <- for (M.toList repoInfo) $ \(file, info) -> do
errors <- for (M.toList repoInfo) $ \(file, info) -> do

let pasta = _fiCopyPastes info
return
$ VerifyResult
$ fmap (\(CopyPaste url txt pos) ->
WithReferenceLoc file (Reference "" "" Nothing pos)
$ PossiblyIncorrectCopyPaste url txt)
pasta

accumulated <- withAsync (printer progressRef) $ \_ ->
forConcurrentlyCaching toScan ifExternalThenCache $ \(file, ref) ->
verifyReference config mode progressRef repoInfo' root file ref
return $ fold accumulated
return $ fold errorss <> fold accumulated
where
printer progressRef = forever $ do
readIORef progressRef >>= reprintAnalyseProgress rw mode
Expand Down
24 changes: 24 additions & 0 deletions tests/Test/Xrefcheck/CopyPasteInListsSpec.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{- SPDX-FileCopyrightText: 2019 Serokell <https://serokell.io>
-
- SPDX-License-Identifier: MPL-2.0
-}

module Test.Xrefcheck.CopyPasteInListsSpec where

import Universum

import Test.Hspec (Spec, describe, it, shouldBe)

import Test.Xrefcheck.Util
import Xrefcheck.Core

spec :: Spec
spec = do
describe "Possibly incorrect copy-paste" $ do
for_ allFlavors $ \fl -> do
it ("is detected (" <> show fl <> ")") $ do
fi <- getFI fl "tests/markdowns/without-annotations/copy-paste_in_lists.md"
getPasta fi `shouldBe`[("a", "c")]
where
getPasta :: FileInfo -> [(Text, Text)]
getPasta fi = map (cpAnchorText &&& cpPlainText) $ fi ^. fiCopyPastes
17 changes: 17 additions & 0 deletions tests/markdowns/without-annotations/copy-paste_in_lists.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<!--
- SPDX-FileCopyrightText: 2018-2019 Serokell <https://serokell.io>
-
- SPDX-License-Identifier: MPL-2.0
-->

A list with bad copy-paste:

- [a](a) e
- [b](b) e
- [c](a) e

A list that is completely fine:

- [a](a) d
- [b](b) d
- [c](c) d