Merge pull request #4 from alexdeploy/moderation

Added simple prompt moderation for violent content.
alexdeploy · May 10, 2023 · 84d52e5 · 84d52e5
2 parents 6693a5a + 3c2a29c
commit 84d52e5
Show file tree

Hide file tree

Showing 4 changed files with 110 additions and 3 deletions.
diff --git a/client/__tests__/Security.test.js b/client/__tests__/Security.test.js
@@ -0,0 +1,47 @@
+const { describe, it, expect } = require('@jest/globals');
+const { moderationCheck } = require('../src/utils/security');
+
+describe('Security check:', () => {
+
+    /**
+     * Builds a brand-new moderation response on every call.
+     * Each test gets its own copy, so a test that flips a category cannot
+     * leak that change into another test (no dependence on execution order).
+     * @returns an object whose `data` holds one result with every category set to false.
+     */
+    const buildResponse = () => ({
+        data: {
+            "id": "modr-XXXXX",
+            "model": "text-moderation-001",
+            "results": [
+              {
+                "categories": {
+                  "hate": false,
+                  "hate/threatening": false,
+                  "self-harm": false,
+                  "sexual": false,
+                  "sexual/minors": false,
+                  "violence": false,
+                  "violence/graphic": false
+                },
+                "category_scores": {
+                  "hate": 0.18805529177188873,
+                  "hate/threatening": 0.0001250059431185946,
+                  "self-harm": 0.0003706029092427343,
+                  "sexual": 0.0008735615410842001,
+                  "sexual/minors": 0.0007470346172340214,
+                  "violence": 0.0041268812492489815,
+                  "violence/graphic": 0.00023186142789199948
+                },
+                "flagged": false
+              }
+            ]
+        }
+    });
+
+    it('should return true only if all categories are false', async () => {
+        const response = buildResponse();
+        const result = await moderationCheck(response.data);
+        expect(result).toBe(true);
+    });
+
+    it('should return false if any category is true', async () => {
+        // Mutate a private copy only; the other test never sees this change.
+        const response = buildResponse();
+        response.data.results[0].categories.hate = true;
+        const result = await moderationCheck(response.data);
+        expect(result).toBe(false);
+    });
+});
diff --git a/client/src/events/messageCreate.js b/client/src/events/messageCreate.js
@@ -1,4 +1,5 @@
-const { chat } = require('../utils/openai');
+const { chat, moderation  } = require('../utils/openai');
+const { moderationCheck } = require('../utils/security');
 
 module.exports = {
 	name: 'messageCreate',
@@ -41,6 +42,20 @@ module.exports = {
              * @see OpenAI Safety best Practices: https://platform.openai.com/docs/guides/safety-best-practices
              */
 
+
+            /**
+             * MODERATE
+             */
+            const classification = await moderation(prompt);
+
+            const moderationChecked = await moderationCheck(classification.data);
+
+           // If the message violates the Content Policy, return a warning message.
+            if(!moderationChecked){
+                await interactionReply.edit("Your message violates OpenAI's Content Policy. Please, try again.");
+                return;
+            }
+
             // Get the response from the chatGPT-3
             const response = await chat(prompt);
 

diff --git a/client/src/utils/openai.js b/client/src/utils/openai.js
@@ -27,9 +27,15 @@ const model = {
     BABBAGE: "",
     CURIE: "curie",
     DAVINCI: "text-davinci-003",
-    GPT3_TURBO: "gpt-3.5-turbo"
+    GPT3_TURBO: "gpt-3.5-turbo",
+    MODERATION: {
+        STABLE: "text-moderation-stable",
+        LATEST: "text-moderation-latest"
+    }
 };
 
+
+
 const role = {
     USER: "user",
     SYSTEM: "system",
@@ -117,7 +123,22 @@ const createChatCompletion = async (prompt) => {
     }
 }
 
+/**
+ * CREATE MODERATION
+ * * Asks the moderation endpoint whether a text violates OpenAI's Content Policy.
+ * @param {*} prompt text to classify; forwarded unchanged as the request `input`.
+ * @returns the awaited result of `openai.createModeration`, passed through untouched.
+ * @see Documentation https://platform.openai.com/docs/api-reference/moderations/create
+ */
+const createModeration = async (prompt) => {
+    // Request payload: the latest moderation model plus the text to classify.
+    const request = {
+        model: model.MODERATION.LATEST,
+        input: prompt,
+    };
+
+    return await openai.createModeration(request);
+}
+
 module.exports = {
     send: createCompletion,
-    chat: createChatCompletion
+    chat: createChatCompletion,
+    moderation: createModeration
 }
diff --git a/client/src/utils/security.js b/client/src/utils/security.js
@@ -0,0 +1,24 @@
+/**
+ * MODERATION RESPONSE CHECK
+ * * Checks for any content policy violations in the response of OpenAI's Content Policy moderation.
+ * Every entry of `moderation.results` is inspected (not only the first one). An entry counts as a
+ * violation when its `flagged` field is true or when any value in its `categories` is true.
+ * @param {*} moderation is a response object of OpenAI's Content Policy moderation.
+ * @see Moderation https://platform.openai.com/docs/guides/moderation/moderation
+ * @returns true if the moderation is safe, false if it violates the Content Policy.
+ * @throws {TypeError} if `moderation.results` is not a non-empty array, so that a response
+ * with nothing to evaluate is never reported as safe.
+ */
+
+const moderationCheck = async (moderation) => {
+
+    // moderation results
+    const results = moderation.results;
+
+    // Fail closed: without at least one result there is nothing that proves the text is safe.
+    if(!Array.isArray(results) || results.length === 0){
+        throw new TypeError("Moderation response has no results to check.");
+    }
+
+    // A single result is unsafe when flagged as a whole or when any category is set.
+    const isUnsafe = (result) =>
+        result.flagged === true || Object.values(result.categories).some(value => value === true);
+
+    return !results.some(isUnsafe);
+}
+
+module.exports = {
+    moderationCheck
+}