From 0424751b2d1821bc11a2b28e1d27f1c8e9236015 Mon Sep 17 00:00:00 2001 From: Tomaz Bratanic Date: Tue, 2 Oct 2018 22:10:54 +0200 Subject: [PATCH 1/2] Fix the write boolean for jaccard index --- .../main/java/org/neo4j/graphalgo/similarity/JaccardProc.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/algo/src/main/java/org/neo4j/graphalgo/similarity/JaccardProc.java b/algo/src/main/java/org/neo4j/graphalgo/similarity/JaccardProc.java index 0cd351c50..4c5e0880f 100644 --- a/algo/src/main/java/org/neo4j/graphalgo/similarity/JaccardProc.java +++ b/algo/src/main/java/org/neo4j/graphalgo/similarity/JaccardProc.java @@ -60,7 +60,7 @@ public Stream jaccard( double similarityCutoff = getSimilarityCutoff(configuration); Stream stream = topN(similarityStream(inputs, computer, configuration, similarityCutoff, getTopK(configuration)), getTopN(configuration)); - boolean write = configuration.isWriteFlag(false) && similarityCutoff > 0.0; + boolean write = configuration.isWriteFlag(false); return writeAndAggregateResults(configuration, stream, inputs.length, write, "SIMILAR"); } From 712719c35bfd7fca51479ba94a4f355f12490c3f Mon Sep 17 00:00:00 2001 From: Tomaz Bratanic Date: Sun, 7 Oct 2018 20:51:29 +0200 Subject: [PATCH 2/2] Change default similarityCutoff value --- .../org/neo4j/graphalgo/similarity/CosineProc.java | 4 ++-- .../neo4j/graphalgo/similarity/EuclideanProc.java | 4 ++-- .../org/neo4j/graphalgo/similarity/JaccardProc.java | 6 +++--- .../org/neo4j/graphalgo/similarity/OverlapProc.java | 4 ++-- .../neo4j/graphalgo/similarity/SimilarityProc.java | 2 +- doc/asciidoc/similarity-cosine.adoc | 4 ++-- doc/asciidoc/similarity-jaccard.adoc | 4 ++-- doc/asciidoc/similarity-overlap.adoc | 4 ++-- .../neo4j/graphalgo/algo/similarity/CosineTest.java | 12 ++++++------ .../neo4j/graphalgo/algo/similarity/JaccardTest.java | 8 ++++---- .../neo4j/graphalgo/algo/similarity/OverlapTest.java | 4 ++-- 11 files changed, 28 insertions(+), 28 deletions(-) diff --git 
a/algo/src/main/java/org/neo4j/graphalgo/similarity/CosineProc.java b/algo/src/main/java/org/neo4j/graphalgo/similarity/CosineProc.java index 11eb7f4ea..a1487e426 100644 --- a/algo/src/main/java/org/neo4j/graphalgo/similarity/CosineProc.java +++ b/algo/src/main/java/org/neo4j/graphalgo/similarity/CosineProc.java @@ -31,7 +31,7 @@ public class CosineProc extends SimilarityProc { @Procedure(name = "algo.similarity.cosine.stream", mode = Mode.READ) - @Description("CALL algo.similarity.cosine.stream([{item:id, weights:[weights]}], {similarityCutoff:-1,degreeCutoff:0}) " + + @Description("CALL algo.similarity.cosine.stream([{item:id, weights:[weights]}], {similarityCutoff:0.1,degreeCutoff:0}) " + "YIELD item1, item2, count1, count2, intersection, similarity - computes cosine distance") // todo count1,count2 = could be the non-null values, intersection the values where both are non-null? public Stream cosineStream( @@ -57,7 +57,7 @@ public Stream cosineStream( } @Procedure(name = "algo.similarity.cosine", mode = Mode.WRITE) - @Description("CALL algo.similarity.cosine([{item:id, weights:[weights]}], {similarityCutoff:-1,degreeCutoff:0}) " + + @Description("CALL algo.similarity.cosine([{item:id, weights:[weights]}], {similarityCutoff:0.1,degreeCutoff:0}) " + "YIELD p50, p75, p90, p99, p999, p100 - computes cosine similarities") public Stream cosine( @Name(value = "data", defaultValue = "null") List> data, diff --git a/algo/src/main/java/org/neo4j/graphalgo/similarity/EuclideanProc.java b/algo/src/main/java/org/neo4j/graphalgo/similarity/EuclideanProc.java index 0d1c273e1..3027cea82 100644 --- a/algo/src/main/java/org/neo4j/graphalgo/similarity/EuclideanProc.java +++ b/algo/src/main/java/org/neo4j/graphalgo/similarity/EuclideanProc.java @@ -44,7 +44,7 @@ public Stream euclideanStream( WeightedInput[] inputs = prepareWeights(data, getDegreeCutoff(configuration)); - double similarityCutoff = getSimilarityCutoff(configuration); + double similarityCutoff = 
configuration.get("similarityCutoff", -1.0D); // as we don't compute the sqrt until the end if (similarityCutoff > 0d) similarityCutoff *= similarityCutoff; @@ -69,7 +69,7 @@ public Stream euclidean( WeightedInput[] inputs = prepareWeights(data, getDegreeCutoff(configuration)); - double similarityCutoff = getSimilarityCutoff(configuration); + double similarityCutoff = configuration.get("similarityCutoff", -1.0D); // as we don't compute the sqrt until the end if (similarityCutoff > 0d) similarityCutoff *= similarityCutoff; diff --git a/algo/src/main/java/org/neo4j/graphalgo/similarity/JaccardProc.java b/algo/src/main/java/org/neo4j/graphalgo/similarity/JaccardProc.java index 4c5e0880f..64a002956 100644 --- a/algo/src/main/java/org/neo4j/graphalgo/similarity/JaccardProc.java +++ b/algo/src/main/java/org/neo4j/graphalgo/similarity/JaccardProc.java @@ -29,7 +29,7 @@ public class JaccardProc extends SimilarityProc { @Procedure(name = "algo.similarity.jaccard.stream", mode = Mode.READ) - @Description("CALL algo.similarity.jaccard.stream([{item:id, categories:[ids]}], {similarityCutoff:-1,degreeCutoff:0}) " + + @Description("CALL algo.similarity.jaccard.stream([{item:id, categories:[ids]}], {similarityCutoff:0.1,degreeCutoff:0}) " + "YIELD item1, item2, count1, count2, intersection, similarity - computes jaccard similarities") public Stream similarityStream( @Name(value = "data", defaultValue = "null") List> data, @@ -45,7 +45,7 @@ public Stream similarityStream( } @Procedure(name = "algo.similarity.jaccard", mode = Mode.WRITE) - @Description("CALL algo.similarity.jaccard([{item:id, categories:[ids]}], {similarityCutoff:-1,degreeCutoff:0}) " + + @Description("CALL algo.similarity.jaccard([{item:id, categories:[ids]}], {similarityCutoff:0.1,degreeCutoff:0}) " + "YIELD p50, p75, p90, p99, p999, p100 - computes jaccard similarities") public Stream jaccard( @Name(value = "data", defaultValue = "null") List> data, @@ -60,7 +60,7 @@ public Stream jaccard( double 
similarityCutoff = getSimilarityCutoff(configuration); Stream stream = topN(similarityStream(inputs, computer, configuration, similarityCutoff, getTopK(configuration)), getTopN(configuration)); - boolean write = configuration.isWriteFlag(false); + boolean write = configuration.isWriteFlag(false) && similarityCutoff > 0.0; return writeAndAggregateResults(configuration, stream, inputs.length, write, "SIMILAR"); } diff --git a/algo/src/main/java/org/neo4j/graphalgo/similarity/OverlapProc.java b/algo/src/main/java/org/neo4j/graphalgo/similarity/OverlapProc.java index 07b534902..7d4b6a0de 100644 --- a/algo/src/main/java/org/neo4j/graphalgo/similarity/OverlapProc.java +++ b/algo/src/main/java/org/neo4j/graphalgo/similarity/OverlapProc.java @@ -31,7 +31,7 @@ public class OverlapProc extends SimilarityProc { @Procedure(name = "algo.similarity.overlap.stream", mode = Mode.READ) - @Description("CALL algo.similarity.overlap.stream([{item:id, targets:[ids]}], {similarityCutoff:-1,degreeCutoff:0}) " + + @Description("CALL algo.similarity.overlap.stream([{item:id, categories:[ids]}], {similarityCutoff:0.1,degreeCutoff:0}) " + "YIELD item1, item2, count1, count2, intersection, similarity - computes overlap similarities") public Stream similarityStream( @Name(value = "data", defaultValue = "null") List> data, @@ -47,7 +47,7 @@ public Stream similarityStream( } @Procedure(name = "algo.similarity.overlap", mode = Mode.WRITE) - @Description("CALL algo.similarity.overlap([{item:id, targets:[ids]}], {similarityCutoff:-1,degreeCutoff:0}) " + + @Description("CALL algo.similarity.overlap([{item:id, categories:[ids]}], {similarityCutoff:0.1,degreeCutoff:0}) " + "YIELD p50, p75, p90, p99, p999, p100 - computes overlap similarities") public Stream overlap( @Name(value = "data", defaultValue = "null") List> data, diff --git a/algo/src/main/java/org/neo4j/graphalgo/similarity/SimilarityProc.java b/algo/src/main/java/org/neo4j/graphalgo/similarity/SimilarityProc.java index 237c18643..bcca60193 
100644 --- a/algo/src/main/java/org/neo4j/graphalgo/similarity/SimilarityProc.java +++ b/algo/src/main/java/org/neo4j/graphalgo/similarity/SimilarityProc.java @@ -88,7 +88,7 @@ Stream writeAndAggregateResults(ProcedureConfiguration } Double getSimilarityCutoff(ProcedureConfiguration configuration) { - return configuration.get("similarityCutoff", -1D); + return configuration.get("similarityCutoff", 0.1D); } Stream similarityStream(T[] inputs, SimilarityComputer computer, ProcedureConfiguration configuration, double cutoff, int topK) { diff --git a/doc/asciidoc/similarity-cosine.adoc b/doc/asciidoc/similarity-cosine.adoc index 8badd897b..d4a377b9e 100644 --- a/doc/asciidoc/similarity-cosine.adoc +++ b/doc/asciidoc/similarity-cosine.adoc @@ -163,7 +163,7 @@ For example, the person most similar to Praveena is Michael, but the person most | `data` | list | null | no | A list of maps of the following structure: `{item: nodeId, weights: [weight, weight, weight]}` | `top` | int | 0 | yes | The number of similar pairs to return. If `0`, it will return as many as it finds. | `topK` | int | 0 | yes | The number of similar values to return per node. If `0`, it will return as many as it finds. -| `similarityCutoff` | int | -1 | yes | The threshold for cosine similarity. Values below this will not be returned. +| `similarityCutoff` | float | 0.1 | yes | The threshold for cosine similarity. Values below this will not be returned. | `degreeCutoff` | int | 0 | yes | The threshold for the number of items in the `targets` list. If the list contains less than this amount, that node will be excluded from the calculation. | `concurrency` | int | available CPUs | yes | The number of concurrent threads. |=== @@ -220,7 +220,7 @@ include::scripts/similarity-cosine.cypher[tag=query] | `data` | list | null | no | A list of maps of the following structure: `{item: nodeId, categories: [nodeId, nodeId, nodeId]}` | `top` | int | 0 | yes | The number of similar pairs to return.
If `0`, it will return as many as it finds. | `topK` | int | 0 | yes | The number of similar values to return per node. If `0`, it will return as many as it finds. -| `similarityCutoff` | int | -1 | yes | The threshold for Jaccard similarity. Values below this will not be returned. +| `similarityCutoff` | float | 0.1 | yes | The threshold for cosine similarity. Values below this will not be returned. | `degreeCutoff` | int | 0 | yes | The threshold for the number of items in the `targets` list. If the list contains less than this amount, that node will be excluded from the calculation. | `concurrency` | int | available CPUs | yes | The number of concurrent threads. | `write` | boolean | false | yes | Indicates whether results should be stored. diff --git a/doc/asciidoc/similarity-jaccard.adoc b/doc/asciidoc/similarity-jaccard.adoc index b281bd35e..4fd88f1c9 100644 --- a/doc/asciidoc/similarity-jaccard.adoc +++ b/doc/asciidoc/similarity-jaccard.adoc @@ -161,7 +161,7 @@ For example, the person most similar to Praveena is Zhen, but the person most si | `data` | list | null | no | A list of maps of the following structure: `{item: nodeId, categories: [nodeId, nodeId, nodeId]}` | `top` | int | 0 | yes | The number of similar pairs to return. If `0`, it will return as many as it finds. | `topK` | int | 0 | yes | The number of similar values to return per node. If `0`, it will return as many as it finds. -| `similarityCutoff` | int | -1 | yes | The threshold for Jaccard similarity. Values below this will not be returned. +| `similarityCutoff` | float | 0.1 | yes | The threshold for Jaccard similarity. Values below this will not be returned. | `degreeCutoff` | int | 0 | yes | The threshold for the number of items in the `targets` list. If the list contains less than this amount, that node will be excluded from the calculation. | `concurrency` | int | available CPUs | yes | The number of concurrent threads.
|=== @@ -217,7 +217,7 @@ include::scripts/similarity-jaccard.cypher[tag=query] | `data` | list | null | no | A list of maps of the following structure: `{item: nodeId, categories: [nodeId, nodeId, nodeId]}` | `top` | int | 0 | yes | The number of similar pairs to return. If `0`, it will return as many as it finds. | `topK` | int | 0 | yes | The number of similar values to return per node. If `0`, it will return as many as it finds. -| `similarityCutoff` | int | -1 | yes | The threshold for Jaccard similarity. Values below this will not be returned. +| `similarityCutoff` | float | 0.1 | yes | The threshold for Jaccard similarity. Values below this will not be returned. | `degreeCutoff` | int | 0 | yes | The threshold for the number of items in the `targets` list. If the list contains less than this amount, that node will be excluded from the calculation. | `concurrency` | int | available CPUs | yes | The number of concurrent threads. | `write` | boolean | false | yes | Indicates whether results should be stored. diff --git a/doc/asciidoc/similarity-overlap.adoc b/doc/asciidoc/similarity-overlap.adoc index 2894626b6..ef21305ca 100644 --- a/doc/asciidoc/similarity-overlap.adoc +++ b/doc/asciidoc/similarity-overlap.adoc @@ -150,7 +150,7 @@ include::scripts/similarity-overlap.cypher[tag=stream-topk] | `data` | list | null | no | A list of maps of the following structure: `{item: nodeId, categories: [nodeId, nodeId, nodeId]}` | `top` | int | 0 | yes | The number of similar pairs to return. If `0`, it will return as many as it finds. | `topK` | int | 0 | yes | The number of similar values to return per node. If `0`, it will return as many as it finds. -| `similarityCutoff` | int | -1 | yes | The threshold for Overlap similarity. Values below this will not be returned. +| `similarityCutoff` | float | 0.1 | yes | The threshold for Overlap similarity. Values below this will not be returned.
| `degreeCutoff` | int | 0 | yes | The threshold for the number of items in the `targets` list. If the list contains less than this amount, that node will be excluded from the calculation. | `concurrency` | int | available CPUs | yes | The number of concurrent threads. |=== @@ -208,7 +208,7 @@ include::scripts/similarity-overlap.cypher[tag=query] | `data` | list | null | no | A list of maps of the following structure: `{item: nodeId, categories: [nodeId, nodeId, nodeId]}` | `top` | int | 0 | yes | The number of similar pairs to return. If `0`, it will return as many as it finds. | `topK` | int | 0 | yes | The number of similar values to return per node. If `0`, it will return as many as it finds. -| `similarityCutoff` | int | -1 | yes | The threshold for Overlap similarity. Values below this will not be returned. +| `similarityCutoff` | float | 0.1 | yes | The threshold for Overlap similarity. Values below this will not be returned. | `degreeCutoff` | int | 0 | yes | The threshold for the number of items in the `targets` list. If the list contains less than this amount, that node will be excluded from the calculation. | `concurrency` | int | available CPUs | yes | The number of concurrent threads. | `write` | boolean | false | yes | Indicates whether results should be stored.
diff --git a/tests/src/test/java/org/neo4j/graphalgo/algo/similarity/CosineTest.java b/tests/src/test/java/org/neo4j/graphalgo/algo/similarity/CosineTest.java index 46152d37c..10cd5eafe 100644 --- a/tests/src/test/java/org/neo4j/graphalgo/algo/similarity/CosineTest.java +++ b/tests/src/test/java/org/neo4j/graphalgo/algo/similarity/CosineTest.java @@ -187,7 +187,7 @@ public void cosineSingleMultiThreadComparisionTopK() { @Test public void topNcosineStreamTest() { - Result results = db.execute(STATEMENT_STREAM, map("config",map("top",2))); + Result results = db.execute(STATEMENT_STREAM, map("config",map("top",2,"similarityCutoff",-1.0))); assert01(results.next()); assert02(results.next()); assertFalse(results.hasNext()); @@ -195,7 +195,7 @@ public void topNcosineStreamTest() { @Test public void cosineStreamTest() { - Result results = db.execute(STATEMENT_STREAM, map("config",map("concurrency",1))); + Result results = db.execute(STATEMENT_STREAM, map("config",map("concurrency",1,"similarityCutoff",-1.0))); assertTrue(results.hasNext()); assert01(results.next()); assert02(results.next()); @@ -208,7 +208,7 @@ public void cosineStreamTest() { @Test public void topKCosineStreamTest() { - Map params = map("config", map( "concurrency", 1,"topK", 1)); + Map params = map("config", map( "concurrency", 1,"topK", 1, "similarityCutoff", -1.0)); System.out.println(db.execute(STATEMENT_STREAM, params).resultAsString()); Result results = db.execute(STATEMENT_STREAM, params); assertTrue(results.hasNext()); @@ -253,7 +253,7 @@ public void topK4cosineStreamTest() { @Test public void topK3cosineStreamTest() { - Map params = map("config", map("concurrency", 3, "topK", 3)); + Map params = map("config", map("concurrency", 3, "topK", 3, "similarityCutoff", -1.0)); System.out.println(db.execute(STATEMENT_STREAM, params).resultAsString()); @@ -267,7 +267,7 @@ public void topK3cosineStreamTest() { @Test public void simpleCosineTest() { - Map params = map("config", map()); + Map params = 
map("config", map("similarityCutoff",-1.0)); Map row = db.execute(STATEMENT,params).next(); assertEquals((double) row.get("p25"), 0.0, 0.01); @@ -283,7 +283,7 @@ public void simpleCosineTest() { public void simpleCosineFromEmbeddingTest() { db.execute(STORE_EMBEDDING_STATEMENT); - Map params = map("config", map()); + Map params = map("config", map("similarityCutoff",-1.0)); Map row = db.execute(EMBEDDING_STATEMENT,params).next(); assertEquals((double) row.get("p25"), 0.0, 0.01); diff --git a/tests/src/test/java/org/neo4j/graphalgo/algo/similarity/JaccardTest.java b/tests/src/test/java/org/neo4j/graphalgo/algo/similarity/JaccardTest.java index 000d15ac0..3e6e24b3a 100644 --- a/tests/src/test/java/org/neo4j/graphalgo/algo/similarity/JaccardTest.java +++ b/tests/src/test/java/org/neo4j/graphalgo/algo/similarity/JaccardTest.java @@ -164,7 +164,7 @@ public void jaccardSingleMultiThreadComparisionTopK() { @Test public void topNjaccardStreamTest() { - Result results = db.execute(STATEMENT_STREAM, map("config",map("top",2))); + Result results = db.execute(STATEMENT_STREAM, map("config",map("top",2,"similarityCutoff",-1.0))); assert01(results.next()); assert02(results.next()); assertFalse(results.hasNext()); @@ -172,7 +172,7 @@ public void topNjaccardStreamTest() { @Test public void jaccardStreamTest() { - Result results = db.execute(STATEMENT_STREAM, map("config",map("concurrency",1))); + Result results = db.execute(STATEMENT_STREAM, map("config",map("concurrency",1,"similarityCutoff",-1.0))); assertTrue(results.hasNext()); assert01(results.next()); assert02(results.next()); @@ -182,7 +182,7 @@ public void jaccardStreamTest() { @Test public void topKJaccardStreamTest() { - Map params = map("config", map( "concurrency", 1,"topK", 1)); + Map params = map("config", map( "concurrency", 1,"topK", 1,"similarityCutoff",-1.0)); System.out.println(db.execute(STATEMENT_STREAM, params).resultAsString()); Result results = db.execute(STATEMENT_STREAM, params); @@ -227,7 +227,7 @@ 
public void topK4jaccardStreamTest() { @Test public void topK3jaccardStreamTest() { - Map params = map("config", map("concurrency", 3, "topK", 3)); + Map params = map("config", map("concurrency", 3, "topK", 3, "similarityCutoff",-1.0)); System.out.println(db.execute(STATEMENT_STREAM, params).resultAsString()); diff --git a/tests/src/test/java/org/neo4j/graphalgo/algo/similarity/OverlapTest.java b/tests/src/test/java/org/neo4j/graphalgo/algo/similarity/OverlapTest.java index ead065d98..f48e7b566 100644 --- a/tests/src/test/java/org/neo4j/graphalgo/algo/similarity/OverlapTest.java +++ b/tests/src/test/java/org/neo4j/graphalgo/algo/similarity/OverlapTest.java @@ -174,7 +174,7 @@ public void topNoverlapStreamTest() { @Test public void overlapStreamTest() { - Result results = db.execute(STATEMENT_STREAM, map("config",map("concurrency",1))); + Result results = db.execute(STATEMENT_STREAM, map("config",map("concurrency",1,"similarityCutoff",-1.0))); assertTrue(results.hasNext()); assert01(results.next()); @@ -229,7 +229,7 @@ public void topK4overlapStreamTest() { @Test public void topK3overlapStreamTest() { - Map params = map("config", map("concurrency", 3, "topK", 3)); + Map params = map("config", map("concurrency", 3, "topK", 3, "similarityCutoff", -1.0)); System.out.println(db.execute(STATEMENT_STREAM, params).resultAsString());