add spammy word list for survey (#51065)

2025-12-19 09:57:42 -05:00 · 2024-06-10 14:38:18 -07:00
parent 99be0b710b
commit d13a7eef53
3 changed files with 72 additions and 5 deletions
--- a/data/survey-words.yml
+++ b/data/survey-words.yml
@@ -0,0 +1,12 @@
+words:
+  - Minecraft
+  - Skype
+  - Instagram
+  - Facebook
+  - Roblox
+  - robux
+  - Game
+  - Bank
+  - Goldy
+  - Hack
+  - Crypto
--- a/src/events/analyze-comment.js
+++ b/src/events/analyze-comment.js
@@ -1,3 +1,5 @@
+import fs from 'fs'
+import yaml from 'js-yaml'
 import { cuss } from 'cuss'
 import { cuss as cussPt } from 'cuss/pt'
 import { cuss as cussFr } from 'cuss/fr'
@@ -19,12 +21,12 @@ export const SIGNAL_RATINGS = [
    validator: (comment) => isContainingEmail(comment),
  },
  {
-    reduction: 0.1,
+    reduction: 1.0,
    name: 'url-only',
    validator: (comment) => isURL(comment),
  },
  {
-    reduction: 0.1,
+    reduction: 1.0,
    name: 'numbers-only',
    validator: (comment) => isNumbersOnly(comment),
  },
@@ -58,6 +60,11 @@ export const SIGNAL_RATINGS = [
    name: 'mostly-emoji',
    validator: (comment) => isMostlyEmoji(comment),
  },
+  {
+    reduction: 1.0,
+    name: 'spammy-words',
+    validator: (comment) => isSpammyWordList(comment),
+  },
 ]

 export async function analyzeComment(text, language = 'en') {
@@ -108,8 +115,7 @@ function isAllUppercase(text) {

 function isTooShort(text) {
  const split = text.trim().split(/\s+/)
-  if (split.length <= 1) {
-    // return !isNumbersOnly(text) && !isURL(text) && !isEmailOnly(text) && !isAllUppercase(text)
+  if (split.length <= 3) {
    return true
  }
 }
@@ -173,3 +179,13 @@ function splitWords(text) {
  const segmentedText = segmenter.segment(text)
  return [...segmentedText].filter((s) => s.isWordLike).map((s) => s.segment)
 }
+
+const surveyYaml = yaml.load(fs.readFileSync('data/survey-words.yml', 'utf8'))
+const surveyWords = surveyYaml.words.map((word) => word.toLowerCase())
+
+function isSpammyWordList(text) {
+  const words = text.toLowerCase().split(/(\s+|\\n+)/g)
+  // Exact-token match only: a survey word embedded in a longer token is intentionally not flagged.
+  // Note: the capture group keeps separators in `words`; `\\n+` matches a literal backslash plus "n"s.
+  return Boolean(words.some((word) => surveyWords.includes(word)))
+}
--- a/src/events/tests/analyze-comments.js
+++ b/src/events/tests/analyze-comments.js
@@ -110,10 +110,24 @@ describe('analyzeComment', () => {
      expect(signals.includes('too-short')).toBeTruthy()
      expect(rating).toBeLessThan(1.0)
    }
+    {
+      const { signals, rating } = await analyzeComment(' Oneword two words')
+      expect(signals.includes('too-short')).toBeTruthy()
+      expect(rating).toBeLessThan(1.0)
+    }
+    {
+      const { signals, rating } = await analyzeComment('A\nB')
+      expect(signals.includes('too-short')).toBeTruthy()
+      expect(rating).toBeLessThan(1.0)
+    }

    // No
    {
-      const { signals } = await analyzeComment('A\nB')
+      const { signals } = await analyzeComment('A\nB\nC\nD')
+      expect(signals.includes('too-short')).toBeFalsy()
+    }
+    {
+      const { signals } = await analyzeComment('One two three four ')
      expect(signals.includes('too-short')).toBeFalsy()
    }
  })
@@ -179,4 +193,29 @@ describe('analyzeComment', () => {
      expect(signals.includes('mostly-emoji')).toBeFalsy()
    }
  })
+
+  test('spammy-words', async () => {
+    // Yes
+    {
+      const { signals, rating } = await analyzeComment('Roblox free roblux')
+      expect(signals.includes('spammy-words')).toBeTruthy()
+      expect(rating).toBeLessThan(1.0)
+    }
+    {
+      const { signals, rating } = await analyzeComment('GOOGLE \n\nGAME')
+      expect(signals.includes('spammy-words')).toBeTruthy()
+      expect(rating).toBeLessThan(1.0)
+    }
+
+    // No
+    {
+      const { signals } = await analyzeComment('GitHub is great!')
+      expect(signals.includes('spammy-words')).toBeFalsy()
+    }
+    // A survey word embedded in a longer token (substring match) must not trigger the signal
+    {
+      const { signals } = await analyzeComment('MinecraftFacebook')
+      expect(signals.includes('spammy-words')).toBeFalsy()
+    }
+  })
 })