package goaway import ( "strings" "unicode" "golang.org/x/text/runes" "golang.org/x/text/transform" "golang.org/x/text/unicode/norm" ) const ( space = " " firstRuneSupported = ' ' lastRuneSupported = '~' ) var ( defaultProfanityDetector *ProfanityDetector ) // ProfanityDetector contains the dictionaries as well as the configuration // for determining how profanity detection is handled type ProfanityDetector struct { sanitizeSpecialCharacters bool // Whether to replace characters with the value ' ' in characterReplacements sanitizeLeetSpeak bool // Whether to replace characters with a non-' ' value in characterReplacements sanitizeAccents bool sanitizeSpaces bool profanities []string falseNegatives []string falsePositives []string characterReplacements map[rune]rune } // NewProfanityDetector creates a new ProfanityDetector func NewProfanityDetector() *ProfanityDetector { return &ProfanityDetector{ sanitizeSpecialCharacters: true, sanitizeLeetSpeak: true, sanitizeAccents: true, sanitizeSpaces: true, profanities: DefaultProfanities, falsePositives: DefaultFalsePositives, falseNegatives: DefaultFalseNegatives, characterReplacements: DefaultCharacterReplacements, } } // WithSanitizeLeetSpeak allows configuring whether the sanitization process should also take into account leetspeak // // Leetspeak characters are characters to be replaced by non-' ' values in the characterReplacements map. // For instance, '4' is replaced by 'a' and '3' is replaced by 'e', which means that "4sshol3" would be // sanitized to "asshole", which would be detected as a profanity. // // By default, this is set to true. func (g *ProfanityDetector) WithSanitizeLeetSpeak(sanitize bool) *ProfanityDetector { g.sanitizeLeetSpeak = sanitize return g.buildCharacterReplacements() } // WithSanitizeSpecialCharacters allows configuring whether the sanitization process should also take into account // special characters. // // Special characters are characters that are part of the characterReplacements map (DefaultCharacterReplacements by // default) and are to be removed during the sanitization step. // // For instance, "fu_ck" would be sanitized to "fuck", which would be detected as a profanity. // // By default, this is set to true. func (g *ProfanityDetector) WithSanitizeSpecialCharacters(sanitize bool) *ProfanityDetector { g.sanitizeSpecialCharacters = sanitize return g.buildCharacterReplacements() } // WithSanitizeAccents allows configuring of whether the sanitization process should also take into account accents. // By default, this is set to true, but since this adds a bit of overhead, you may disable it if your use case // is time-sensitive or if the input doesn't involve accents (i.e. if the input can never contain special characters) func (g *ProfanityDetector) WithSanitizeAccents(sanitize bool) *ProfanityDetector { g.sanitizeAccents = sanitize return g } // WithSanitizeSpaces allows configuring whether the sanitization process should also take into account spaces func (g *ProfanityDetector) WithSanitizeSpaces(sanitize bool) *ProfanityDetector { g.sanitizeSpaces = sanitize return g } // WithCustomDictionary allows configuring whether the sanitization process should also take into account // custom profanities, false positives and false negatives dictionaries. // All dictionaries are expected to be lowercased. func (g *ProfanityDetector) WithCustomDictionary(profanities, falsePositives, falseNegatives []string) *ProfanityDetector { g.profanities = profanities g.falsePositives = falsePositives g.falseNegatives = falseNegatives return g } // WithCustomCharacterReplacements allows configuring characters that to be replaced by other characters. // // Note that all entries that have the value ' ' are considered as special characters while all entries with a value // that is not ' ' are considered as leet speak. // // Defaults to DefaultCharacterReplacements func (g *ProfanityDetector) WithCustomCharacterReplacements(characterReplacements map[rune]rune) *ProfanityDetector { g.characterReplacements = characterReplacements return g } // IsProfane takes in a string (word or sentence) and look for profanities. // Returns a boolean func (g *ProfanityDetector) IsProfane(s string) bool { return len(g.ExtractProfanity(s)) > 0 } // ExtractProfanity takes in a string (word or sentence) and look for profanities. // Returns the first profanity found, or an empty string if none are found func (g *ProfanityDetector) ExtractProfanity(s string) string { s, _ = g.sanitize(s, false) // Check for false negatives for _, word := range g.falseNegatives { if match := strings.Contains(s, word); match { return word } } // Remove false positives for _, word := range g.falsePositives { s = strings.Replace(s, word, "", -1) } // Check for profanities for _, word := range g.profanities { if match := strings.Contains(s, word); match { return word } } return "" } func (g *ProfanityDetector) indexToRune(s string, index int) int { count := 0 for i := range s { if i == index { break } if i < index { count++ } } return count } func (g *ProfanityDetector) Censor(s string) string { censored := []rune(s) var originalIndexes []int s, originalIndexes = g.sanitize(s, true) runeWordLength := 0 g.checkProfanity(&s, &originalIndexes, &censored, g.falseNegatives, &runeWordLength) g.removeFalsePositives(&s, &originalIndexes, &runeWordLength) g.checkProfanity(&s, &originalIndexes, &censored, g.profanities, &runeWordLength) return string(censored) } func (g *ProfanityDetector) checkProfanity(s *string, originalIndexes *[]int, censored *[]rune, wordList []string, runeWordLength *int) { for _, word := range wordList { currentIndex := 0 *runeWordLength = len([]rune(word)) for currentIndex != -1 { if foundIndex := strings.Index((*s)[currentIndex:], word); foundIndex != -1 { for i := 0; i < *runeWordLength; i++ { runeIndex := g.indexToRune(*s, currentIndex+foundIndex) + i if runeIndex < len(*originalIndexes) { (*censored)[(*originalIndexes)[runeIndex]] = '*' } } currentIndex += foundIndex + len([]byte(word)) } else { break } } } } func (g *ProfanityDetector) removeFalsePositives(s *string, originalIndexes *[]int, runeWordLength *int) { for _, word := range g.falsePositives { currentIndex := 0 *runeWordLength = len([]rune(word)) for currentIndex != -1 { if foundIndex := strings.Index((*s)[currentIndex:], word); foundIndex != -1 { foundRuneIndex := g.indexToRune(*s, foundIndex) *originalIndexes = append((*originalIndexes)[:foundRuneIndex], (*originalIndexes)[foundRuneIndex+*runeWordLength:]...) currentIndex += foundIndex + len([]byte(word)) } else { break } } *s = strings.Replace(*s, word, "", -1) } } func (g ProfanityDetector) sanitize(s string, rememberOriginalIndexes bool) (string, []int) { s = strings.ToLower(s) if g.sanitizeLeetSpeak && !rememberOriginalIndexes && g.sanitizeSpecialCharacters { s = strings.ReplaceAll(s, "()", "o") } sb := strings.Builder{} for _, char := range s { if replacement, found := g.characterReplacements[char]; found { if g.sanitizeSpecialCharacters && replacement == ' ' { // If the replacement is a space, and we're sanitizing special characters speak, we replace. sb.WriteRune(replacement) continue } else if g.sanitizeLeetSpeak && replacement != ' ' { // If the replacement isn't a space, and we're sanitizing leet speak, we replace. sb.WriteRune(replacement) continue } } sb.WriteRune(char) } s = sb.String() if g.sanitizeAccents { s = removeAccents(s) } var originalIndexes []int if rememberOriginalIndexes { for i, c := range []rune(s) { // If spaces aren't being sanitized, appending to the original indices prevents off-by-one errors later on. if c != ' ' || !g.sanitizeSpaces { originalIndexes = append(originalIndexes, i) } } } if g.sanitizeSpaces { s = strings.Replace(s, space, "", -1) } return s, originalIndexes } // removeAccents strips all accents from characters. // Only called if ProfanityDetector.removeAccents is set to true func removeAccents(s string) string { removeAccentsTransformer := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC) for _, character := range s { // If there's a character outside the range of supported runes, there might be some accented words if character < firstRuneSupported || character > lastRuneSupported { s, _, _ = transform.String(removeAccentsTransformer, s) break } } return s } // buildCharacterReplacements builds characterReplacements if WithSanitizeLeetSpeak or WithSanitizeSpecialCharacters is // called. // // If this is not called, DefaultCharacterReplacements func (g *ProfanityDetector) buildCharacterReplacements() *ProfanityDetector { g.characterReplacements = make(map[rune]rune) if g.sanitizeSpecialCharacters { g.characterReplacements['-'] = ' ' g.characterReplacements['_'] = ' ' g.characterReplacements['|'] = ' ' g.characterReplacements['.'] = ' ' g.characterReplacements[','] = ' ' g.characterReplacements['('] = ' ' g.characterReplacements[')'] = ' ' g.characterReplacements['<'] = ' ' g.characterReplacements['>'] = ' ' g.characterReplacements['"'] = ' ' g.characterReplacements['`'] = ' ' g.characterReplacements['~'] = ' ' g.characterReplacements['*'] = ' ' g.characterReplacements['&'] = ' ' g.characterReplacements['%'] = ' ' g.characterReplacements['$'] = ' ' g.characterReplacements['#'] = ' ' g.characterReplacements['@'] = ' ' g.characterReplacements['!'] = ' ' g.characterReplacements['?'] = ' ' g.characterReplacements['+'] = ' ' } if g.sanitizeLeetSpeak { g.characterReplacements['4'] = 'a' g.characterReplacements['$'] = 's' g.characterReplacements['!'] = 'i' g.characterReplacements['+'] = 't' g.characterReplacements['#'] = 'h' g.characterReplacements['@'] = 'a' g.characterReplacements['0'] = 'o' g.characterReplacements['1'] = 'i' g.characterReplacements['7'] = 'l' g.characterReplacements['3'] = 'e' g.characterReplacements['5'] = 's' g.characterReplacements['<'] = 'c' } return g } // IsProfane checks whether there are any profanities in a given string (word or sentence). // // Uses the default ProfanityDetector func IsProfane(s string) bool { if defaultProfanityDetector == nil { defaultProfanityDetector = NewProfanityDetector() } return defaultProfanityDetector.IsProfane(s) } // ExtractProfanity takes in a string (word or sentence) and look for profanities. // Returns the first profanity found, or an empty string if none are found // // Uses the default ProfanityDetector func ExtractProfanity(s string) string { if defaultProfanityDetector == nil { defaultProfanityDetector = NewProfanityDetector() } return defaultProfanityDetector.ExtractProfanity(s) } // Censor takes in a string (word or sentence) and tries to censor all profanities found. // // Uses the default ProfanityDetector func Censor(s string) string { if defaultProfanityDetector == nil { defaultProfanityDetector = NewProfanityDetector() } return defaultProfanityDetector.Censor(s) }