add spammy word list for survey (#51065)

2025-12-19 18:10:59 -05:00 · 2024-06-10 14:38:18 -07:00
parent 99be0b710b
commit d13a7eef53
3 changed files with 72 additions and 5 deletions
--- a/data/survey-words.yml
+++ b/data/survey-words.yml
@@ -0,0 +1,12 @@
 words:
  - Minecraft
  - Skype
  - Instagram
  - Facebook
  - Roblox
  - robux
  - Game
  - Bank
  - Goldy
  - Hack
  - Crypto
--- a/src/events/analyze-comment.js
+++ b/src/events/analyze-comment.js
@@ -1,3 +1,5 @@
 import fs from 'fs'
 import yaml from 'js-yaml'
 import { cuss } from 'cuss'
 import { cuss as cussPt } from 'cuss/pt'
 import { cuss as cussFr } from 'cuss/fr'
@@ -19,12 +21,12 @@ export const SIGNAL_RATINGS = [
    validator: (comment) => isContainingEmail(comment),
  },
  {
-    reduction: 0.1,
+    reduction: 1.0,
    name: 'url-only',
    validator: (comment) => isURL(comment),
  },
  {
-    reduction: 0.1,
+    reduction: 1.0,
    name: 'numbers-only',
    validator: (comment) => isNumbersOnly(comment),
  },
@@ -58,6 +60,11 @@ export const SIGNAL_RATINGS = [
    name: 'mostly-emoji',
    validator: (comment) => isMostlyEmoji(comment),
  },
  {
    reduction: 1.0,
    name: 'spammy-words',
    validator: (comment) => isSpammyWordList(comment),
  },
 ]
 export async function analyzeComment(text, language = 'en') {
@@ -108,8 +115,7 @@ function isAllUppercase(text) {
 // Reports a comment as too short based on how many whitespace-separated
 // tokens remain after trimming. Returns true when the count is at or under
 // the threshold; otherwise execution falls off the end of the function and
 // the result is undefined (falsy).
 function isTooShort(text) {
  const split = text.trim().split(/\s+/)
-  if (split.length <= 1) {
+  if (split.length <= 3) {
    // return !isNumbersOnly(text) && !isURL(text) && !isEmailOnly(text) && !isAllUppercase(text)
    return true
  }
 }
@@ -173,3 +179,13 @@ function splitWords(text) {
  const segmentedText = segmenter.segment(text)
  return [...segmentedText].filter((s) => s.isWordLike).map((s) => s.segment)
 }
 // Load the survey spam word list synchronously, once, when this module is
 // evaluated. The path is relative, so Node resolves it against the process
 // working directory rather than against this file's location.
 const surveyYaml = yaml.load(fs.readFileSync('data/survey-words.yml', 'utf8'))
 // Lowercased copies of the listed words, so they can be compared against
 // lowercased comment text.
 const surveyWords = surveyYaml.words.map((word) => word.toLowerCase())
 /**
  * Reports whether a comment contains any entry of the survey spam word list
  * as a standalone token.
  *
  * @param {string} text - The comment text to inspect.
  * @returns {boolean} True when at least one token equals a listed word.
  */
 function isSpammyWordList(text) {
  const lowered = text.toLowerCase()
  // Tokens are cut on whitespace runs, or on a literal backslash followed by
  // one or more "n". The capture group keeps the separators in the list too.
  const tokens = lowered.split(/(\s+|\\n+)/g)
  // Whole-token equality only: a listed word embedded inside a longer token
  // intentionally does not count as a match.
  for (const token of tokens) {
    if (surveyWords.includes(token)) {
      return true
    }
  }
  return false
 }
--- a/src/events/tests/analyze-comments.js
+++ b/src/events/tests/analyze-comments.js
@@ -110,10 +110,24 @@ describe('analyzeComment', () => {
      expect(signals.includes('too-short')).toBeTruthy()
      expect(rating).toBeLessThan(1.0)
    }
    {
      const { signals, rating } = await analyzeComment(' Oneword two words')
      expect(signals.includes('too-short')).toBeTruthy()
      expect(rating).toBeLessThan(1.0)
    }
    {
      const { signals, rating } = await analyzeComment('A\nB')
      expect(signals.includes('too-short')).toBeTruthy()
      expect(rating).toBeLessThan(1.0)
    }
    // No
    {
-      const { signals } = await analyzeComment('A\nB')
+      const { signals } = await analyzeComment('A\nB\nC\nD')
      expect(signals.includes('too-short')).toBeFalsy()
    }
    {
      const { signals } = await analyzeComment('One two three four ')
      expect(signals.includes('too-short')).toBeFalsy()
    }
  })
@@ -179,4 +193,29 @@ describe('analyzeComment', () => {
      expect(signals.includes('mostly-emoji')).toBeFalsy()
    }
  })
  test('spammy-words', async () => {
    // Comments holding a listed word as a standalone token must raise the
    // signal and pull the rating below 1.0.
    const flaggedComments = ['Roblox free roblux', 'GOOGLE \n\nGAME']
    for (const comment of flaggedComments) {
      const result = await analyzeComment(comment)
      expect(result.signals.includes('spammy-words')).toBeTruthy()
      expect(result.rating).toBeLessThan(1.0)
    }
    // Ordinary text must not raise the signal, and neither must listed words
    // glued together into one token (no sub-string matches allowed).
    const cleanComments = ['GitHub is great!', 'MinecraftFacebook']
    for (const comment of cleanComments) {
      const result = await analyzeComment(comment)
      expect(result.signals.includes('spammy-words')).toBeFalsy()
    }
  })
 })