1
0
mirror of synced 2025-12-19 09:57:42 -05:00

add spammy word list for survey (#51065)

This commit is contained in:
Rachael Sewell
2024-06-10 14:38:18 -07:00
committed by GitHub
parent 99be0b710b
commit d13a7eef53
3 changed files with 72 additions and 5 deletions

12
data/survey-words.yml Normal file
View File

@@ -0,0 +1,12 @@
words:
- Minecraft
- Skype
- Instagram
- Facebook
- Roblox
- robux
- Game
- Bank
- Goldy
- Hack
- Crypto

View File

@@ -1,3 +1,5 @@
import fs from 'fs'
import yaml from 'js-yaml'
import { cuss } from 'cuss'
import { cuss as cussPt } from 'cuss/pt'
import { cuss as cussFr } from 'cuss/fr'
@@ -19,12 +21,12 @@ export const SIGNAL_RATINGS = [
validator: (comment) => isContainingEmail(comment),
},
{
reduction: 0.1,
reduction: 1.0,
name: 'url-only',
validator: (comment) => isURL(comment),
},
{
reduction: 0.1,
reduction: 1.0,
name: 'numbers-only',
validator: (comment) => isNumbersOnly(comment),
},
@@ -58,6 +60,11 @@ export const SIGNAL_RATINGS = [
name: 'mostly-emoji',
validator: (comment) => isMostlyEmoji(comment),
},
{
reduction: 1.0,
name: 'spammy-words',
validator: (comment) => isSpammyWordList(comment),
},
]
export async function analyzeComment(text, language = 'en') {
@@ -108,8 +115,7 @@ function isAllUppercase(text) {
function isTooShort(text) {
const split = text.trim().split(/\s+/)
if (split.length <= 1) {
// return !isNumbersOnly(text) && !isURL(text) && !isEmailOnly(text) && !isAllUppercase(text)
if (split.length <= 3) {
return true
}
}
@@ -173,3 +179,13 @@ function splitWords(text) {
const segmentedText = segmenter.segment(text)
return [...segmentedText].filter((s) => s.isWordLike).map((s) => s.segment)
}
// Parse the survey spam word list from its YAML data file and keep a
// lowercased copy of every entry so comment words can be compared
// case-insensitively.
const surveyYaml = yaml.load(
  fs.readFileSync('data/survey-words.yml', 'utf8'),
)
const surveyWords = Array.from(surveyYaml.words, (entry) => entry.toLowerCase())
/**
 * Reports whether a comment contains any word from the survey spam list.
 *
 * The comment is lowercased and tokenized on runs of whitespace and on a
 * backslash followed by one or more `n` characters (presumably escaped
 * newlines left in the raw text -- confirm against the caller). A token must
 * equal a list entry exactly to count as a hit.
 *
 * @param {string} text - Raw comment text.
 * @returns {boolean} True when at least one token equals a list entry.
 */
function isSpammyWordList(text) {
  // Non-capturing separator: with a capturing group, split() would also
  // return every separator as a token of its own.
  const words = text.toLowerCase().split(/\s+|\\n+/)
  // Currently, we're intentionally not checking for
  // survey words that are substrings of a comment word.
  return words.some((word) => surveyWords.includes(word))
}

View File

@@ -110,10 +110,24 @@ describe('analyzeComment', () => {
expect(signals.includes('too-short')).toBeTruthy()
expect(rating).toBeLessThan(1.0)
}
{
const { signals, rating } = await analyzeComment(' Oneword two words')
expect(signals.includes('too-short')).toBeTruthy()
expect(rating).toBeLessThan(1.0)
}
{
const { signals, rating } = await analyzeComment('A\nB')
expect(signals.includes('too-short')).toBeTruthy()
expect(rating).toBeLessThan(1.0)
}
// No
{
const { signals } = await analyzeComment('A\nB')
const { signals } = await analyzeComment('A\nB\nC\nD')
expect(signals.includes('too-short')).toBeFalsy()
}
{
const { signals } = await analyzeComment('One two three four ')
expect(signals.includes('too-short')).toBeFalsy()
}
})
@@ -179,4 +193,29 @@ describe('analyzeComment', () => {
expect(signals.includes('mostly-emoji')).toBeFalsy()
}
})
test('spammy-words', async () => {
  // Yes: a listed word appears as its own token, whatever its case and
  // whatever whitespace separates the tokens.
  const flaggedComments = ['Roblox free roblux', 'GOOGLE \n\nGAME']
  for (const comment of flaggedComments) {
    const { signals, rating } = await analyzeComment(comment)
    expect(signals.includes('spammy-words')).toBeTruthy()
    expect(rating).toBeLessThan(1.0)
  }

  // No: either no listed word is present, or listed words only occur
  // inside a longer token (no sub-string matches allowed).
  const cleanComments = ['GitHub is great!', 'MinecraftFacebook']
  for (const comment of cleanComments) {
    const { signals } = await analyzeComment(comment)
    expect(signals.includes('spammy-words')).toBeFalsy()
  }
})
})