add spammy word list for survey (#51065)
This commit is contained in:
12
data/survey-words.yml
Normal file
12
data/survey-words.yml
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
words:
|
||||||
|
- Minecraft
|
||||||
|
- Skype
|
||||||
|
- Instagram
|
||||||
|
- Facebook
|
||||||
|
- Roblox
|
||||||
|
- robux
|
||||||
|
- Game
|
||||||
|
- Bank
|
||||||
|
- Goldy
|
||||||
|
- Hack
|
||||||
|
- Crypto
|
||||||
@@ -1,3 +1,5 @@
|
|||||||
|
import fs from 'fs'
|
||||||
|
import yaml from 'js-yaml'
|
||||||
import { cuss } from 'cuss'
|
import { cuss } from 'cuss'
|
||||||
import { cuss as cussPt } from 'cuss/pt'
|
import { cuss as cussPt } from 'cuss/pt'
|
||||||
import { cuss as cussFr } from 'cuss/fr'
|
import { cuss as cussFr } from 'cuss/fr'
|
||||||
@@ -19,12 +21,12 @@ export const SIGNAL_RATINGS = [
|
|||||||
validator: (comment) => isContainingEmail(comment),
|
validator: (comment) => isContainingEmail(comment),
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
reduction: 0.1,
|
reduction: 1.0,
|
||||||
name: 'url-only',
|
name: 'url-only',
|
||||||
validator: (comment) => isURL(comment),
|
validator: (comment) => isURL(comment),
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
reduction: 0.1,
|
reduction: 1.0,
|
||||||
name: 'numbers-only',
|
name: 'numbers-only',
|
||||||
validator: (comment) => isNumbersOnly(comment),
|
validator: (comment) => isNumbersOnly(comment),
|
||||||
},
|
},
|
||||||
@@ -58,6 +60,11 @@ export const SIGNAL_RATINGS = [
|
|||||||
name: 'mostly-emoji',
|
name: 'mostly-emoji',
|
||||||
validator: (comment) => isMostlyEmoji(comment),
|
validator: (comment) => isMostlyEmoji(comment),
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
reduction: 1.0,
|
||||||
|
name: 'spammy-words',
|
||||||
|
validator: (comment) => isSpammyWordList(comment),
|
||||||
|
},
|
||||||
]
|
]
|
||||||
|
|
||||||
export async function analyzeComment(text, language = 'en') {
|
export async function analyzeComment(text, language = 'en') {
|
||||||
@@ -108,8 +115,7 @@ function isAllUppercase(text) {
|
|||||||
|
|
||||||
/**
 * Report whether a comment is too short to count as meaningful feedback.
 *
 * The text is trimmed and split on runs of whitespace (spaces, tabs,
 * newlines); a comment made of three or fewer tokens is considered too short.
 * An empty or whitespace-only comment yields a single empty token and is
 * therefore also too short.
 *
 * @param {string} text - Raw comment text.
 * @returns {boolean} `true` when the comment has at most three tokens,
 *   otherwise `false` (always a real boolean, never `undefined`).
 */
function isTooShort(text) {
  const tokens = text.trim().split(/\s+/)
  return tokens.length <= 3
}
|
||||||
@@ -173,3 +179,13 @@ function splitWords(text) {
|
|||||||
const segmentedText = segmenter.segment(text)
|
const segmentedText = segmenter.segment(text)
|
||||||
return [...segmentedText].filter((s) => s.isWordLike).map((s) => s.segment)
|
return [...segmentedText].filter((s) => s.isWordLike).map((s) => s.segment)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Parse the spam word list from data/survey-words.yml. The file is read
// synchronously, and the path is relative, so Node resolves it against the
// process's current working directory.
// NOTE(review): this looks like it runs once at module load, so a missing or
// malformed file would fail the import of this module — confirm that is intended.
const surveyYaml = yaml.load(fs.readFileSync('data/survey-words.yml', 'utf8'))
|
||||||
|
// Lowercased copy of the `words` entries, used for case-insensitive matching
// against lowercased comment text.
const surveyWords = surveyYaml.words.map((word) => word.toLowerCase())
|
||||||
|
|
||||||
|
/**
 * Report whether a comment contains any word from the spam word list.
 *
 * The comment is lowercased and split on runs of whitespace and on literal
 * backslash-n sequences (the two characters `\` followed by `n`). Each token
 * then has leading and trailing non-letter/non-digit characters removed, so
 * that "Robux!" or "(Minecraft)" are still recognized, and is compared for
 * exact equality with the list entries.
 *
 * Currently, we're intentionally not checking for survey words that are
 * substrings of a comment word: "MinecraftFacebook" does not match.
 *
 * @param {string} text - Raw comment text.
 * @param {Iterable<string>} [wordList=surveyWords] - Lowercase words to look
 *   for; defaults to the module-level list loaded from the survey words file.
 * @returns {boolean} `true` when at least one token equals a listed word.
 */
function isSpammyWordList(text, wordList = surveyWords) {
  // A Set gives constant-time membership checks for every token.
  const spamWords = new Set(wordList)
  return (
    text
      .toLowerCase()
      // No capturing group: separators must not be emitted as tokens.
      .split(/\s+|\\n+/)
      // Strip punctuation/symbols hugging the word, but never its interior.
      .map((token) => token.replace(/^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu, ''))
      .some((token) => spamWords.has(token))
  )
}
|
||||||
|
|||||||
@@ -110,10 +110,24 @@ describe('analyzeComment', () => {
|
|||||||
expect(signals.includes('too-short')).toBeTruthy()
|
expect(signals.includes('too-short')).toBeTruthy()
|
||||||
expect(rating).toBeLessThan(1.0)
|
expect(rating).toBeLessThan(1.0)
|
||||||
}
|
}
|
||||||
|
{
|
||||||
|
const { signals, rating } = await analyzeComment(' Oneword two words')
|
||||||
|
expect(signals.includes('too-short')).toBeTruthy()
|
||||||
|
expect(rating).toBeLessThan(1.0)
|
||||||
|
}
|
||||||
|
{
|
||||||
|
const { signals, rating } = await analyzeComment('A\nB')
|
||||||
|
expect(signals.includes('too-short')).toBeTruthy()
|
||||||
|
expect(rating).toBeLessThan(1.0)
|
||||||
|
}
|
||||||
|
|
||||||
// No
|
// No
|
||||||
{
|
{
|
||||||
const { signals } = await analyzeComment('A\nB')
|
const { signals } = await analyzeComment('A\nB\nC\nD')
|
||||||
|
expect(signals.includes('too-short')).toBeFalsy()
|
||||||
|
}
|
||||||
|
{
|
||||||
|
const { signals } = await analyzeComment('One two three four ')
|
||||||
expect(signals.includes('too-short')).toBeFalsy()
|
expect(signals.includes('too-short')).toBeFalsy()
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
@@ -179,4 +193,29 @@ describe('analyzeComment', () => {
|
|||||||
expect(signals.includes('mostly-emoji')).toBeFalsy()
|
expect(signals.includes('mostly-emoji')).toBeFalsy()
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
|
test('spammy-words', async () => {
|
||||||
|
// Yes
|
||||||
|
{
|
||||||
|
const { signals, rating } = await analyzeComment('Roblox free roblux')
|
||||||
|
expect(signals.includes('spammy-words')).toBeTruthy()
|
||||||
|
expect(rating).toBeLessThan(1.0)
|
||||||
|
}
|
||||||
|
{
|
||||||
|
const { signals, rating } = await analyzeComment('GOOGLE \n\nGAME')
|
||||||
|
expect(signals.includes('spammy-words')).toBeTruthy()
|
||||||
|
expect(rating).toBeLessThan(1.0)
|
||||||
|
}
|
||||||
|
|
||||||
|
// No
|
||||||
|
{
|
||||||
|
const { signals } = await analyzeComment('GitHub is great!')
|
||||||
|
expect(signals.includes('spammy-words')).toBeFalsy()
|
||||||
|
}
|
||||||
|
// No sub-string matches allowed
|
||||||
|
{
|
||||||
|
const { signals } = await analyzeComment('MinecraftFacebook')
|
||||||
|
expect(signals.includes('spammy-words')).toBeFalsy()
|
||||||
|
}
|
||||||
|
})
|
||||||
})
|
})
|
||||||
|
|||||||
Reference in New Issue
Block a user