add spammy word list for survey (#51065)
This commit is contained in:
12
data/survey-words.yml
Normal file
12
data/survey-words.yml
Normal file
@@ -0,0 +1,12 @@
|
||||
words:
|
||||
- Minecraft
|
||||
- Skype
|
||||
- Instagram
|
||||
- Facebook
|
||||
- Roblox
|
||||
- robux
|
||||
- Game
|
||||
- Bank
|
||||
- Goldy
|
||||
- Hack
|
||||
- Crypto
|
||||
@@ -1,3 +1,5 @@
|
||||
import fs from 'fs'
|
||||
import yaml from 'js-yaml'
|
||||
import { cuss } from 'cuss'
|
||||
import { cuss as cussPt } from 'cuss/pt'
|
||||
import { cuss as cussFr } from 'cuss/fr'
|
||||
@@ -19,12 +21,12 @@ export const SIGNAL_RATINGS = [
|
||||
validator: (comment) => isContainingEmail(comment),
|
||||
},
|
||||
{
|
||||
reduction: 0.1,
|
||||
reduction: 1.0,
|
||||
name: 'url-only',
|
||||
validator: (comment) => isURL(comment),
|
||||
},
|
||||
{
|
||||
reduction: 0.1,
|
||||
reduction: 1.0,
|
||||
name: 'numbers-only',
|
||||
validator: (comment) => isNumbersOnly(comment),
|
||||
},
|
||||
@@ -58,6 +60,11 @@ export const SIGNAL_RATINGS = [
|
||||
name: 'mostly-emoji',
|
||||
validator: (comment) => isMostlyEmoji(comment),
|
||||
},
|
||||
{
|
||||
reduction: 1.0,
|
||||
name: 'spammy-words',
|
||||
validator: (comment) => isSpammyWordList(comment),
|
||||
},
|
||||
]
|
||||
|
||||
export async function analyzeComment(text, language = 'en') {
|
||||
@@ -108,8 +115,7 @@ function isAllUppercase(text) {
|
||||
|
||||
function isTooShort(text) {
|
||||
const split = text.trim().split(/\s+/)
|
||||
if (split.length <= 1) {
|
||||
// return !isNumbersOnly(text) && !isURL(text) && !isEmailOnly(text) && !isAllUppercase(text)
|
||||
if (split.length <= 3) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
@@ -173,3 +179,13 @@ function splitWords(text) {
|
||||
const segmentedText = segmenter.segment(text)
|
||||
return [...segmentedText].filter((s) => s.isWordLike).map((s) => s.segment)
|
||||
}
|
||||
|
||||
// Spam keyword list for survey comments, read once when this module loads.
// The relative path is resolved against the process working directory.
const surveyYaml = yaml.load(fs.readFileSync('data/survey-words.yml', 'utf8'))
// Lower-case every entry so it can be compared against lower-cased comment text.
// String() guards against YAML scalars that parse as non-strings (e.g. numbers
// or booleans), which would otherwise throw on .toLowerCase().
const surveyWords = surveyYaml.words.map((word) => String(word).toLowerCase())
|
||||
|
||||
/**
 * Reports whether a comment contains any word from the survey spam list.
 *
 * The comment is lower-cased and split on runs of whitespace (and on a literal
 * backslash followed by one or more "n" characters). A piece only counts when
 * it equals an entry of `surveyWords` exactly.
 *
 * @param {string} text - The comment text to inspect.
 * @returns {boolean} True when at least one piece of the comment is a listed word.
 */
function isSpammyWordList(text) {
  // No capturing group here: with one, split() would also return the
  // separators, which would then be pointlessly compared against the list.
  const words = text.toLowerCase().split(/\s+|\\n+/)
  // Currently, we're intentionally not checking for
  // survey words that are substrings of a comment word.
  return words.some((word) => surveyWords.includes(word))
}
|
||||
|
||||
@@ -110,10 +110,24 @@ describe('analyzeComment', () => {
|
||||
expect(signals.includes('too-short')).toBeTruthy()
|
||||
expect(rating).toBeLessThan(1.0)
|
||||
}
|
||||
{
|
||||
const { signals, rating } = await analyzeComment(' Oneword two words')
|
||||
expect(signals.includes('too-short')).toBeTruthy()
|
||||
expect(rating).toBeLessThan(1.0)
|
||||
}
|
||||
{
|
||||
const { signals, rating } = await analyzeComment('A\nB')
|
||||
expect(signals.includes('too-short')).toBeTruthy()
|
||||
expect(rating).toBeLessThan(1.0)
|
||||
}
|
||||
|
||||
// No
|
||||
{
|
||||
const { signals } = await analyzeComment('A\nB')
|
||||
const { signals } = await analyzeComment('A\nB\nC\nD')
|
||||
expect(signals.includes('too-short')).toBeFalsy()
|
||||
}
|
||||
{
|
||||
const { signals } = await analyzeComment('One two three four ')
|
||||
expect(signals.includes('too-short')).toBeFalsy()
|
||||
}
|
||||
})
|
||||
@@ -179,4 +193,29 @@ describe('analyzeComment', () => {
|
||||
expect(signals.includes('mostly-emoji')).toBeFalsy()
|
||||
}
|
||||
})
|
||||
|
||||
test('spammy-words', async () => {
  // Yes: each comment contains a listed word as a whole,
  // whitespace-delimited token (case-insensitive).
  const flaggedComments = ['Roblox free roblux', 'GOOGLE \n\nGAME']
  for (const comment of flaggedComments) {
    const { signals, rating } = await analyzeComment(comment)
    expect(signals.includes('spammy-words')).toBeTruthy()
    expect(rating).toBeLessThan(1.0)
  }

  // No: the first comment has no listed word; the second only contains
  // listed words as sub-strings of a longer token, which must not match.
  const cleanComments = ['GitHub is great!', 'MinecraftFacebook']
  for (const comment of cleanComments) {
    const { signals } = await analyzeComment(comment)
    expect(signals.includes('spammy-words')).toBeFalsy()
  }
})
|
||||
})
|
||||
|
||||
Reference in New Issue
Block a user