Pau*_*ite 32 python nlp profanity
像/sf/ask/106515251/,但对于Python - 我正在寻找库,我可以在本地运行和控制自己,而不是Web服务.
(虽然听到你对亵渎过滤的原则的基本反对意见总是很好,但我并不是在这里专门寻找它们.我知道亵渎过滤不能发现所说的每一个有害的事情.我知道在大计划中咒骂事情,并不是一个特别大的问题.我知道你需要一些人力投入来处理内容问题.我只想找到一个好的图书馆,看看我能用它做什么.)
leo*_*luk 41
我没有找到任何Python亵渎图书馆,所以我自己做了一个.
filterlist与禁用词匹配的正则表达式列表.请不要使用\b,它将被插入取决于inside_words.
例:
['bad', 'un\w+']
ignore_case默认: True
不言自明.
replacements默认: "$@%-?!"
带有字符的字符串,将从中随机生成替换字符串.
示例:"%&$?!"或"-"等
complete默认: True
控制是否将替换整个字符串或是否保留第一个和最后一个字符.
inside_words默认: False
控制是否在其他单词中搜索单词.禁用此功能
(最后的例子)
"""
Module that provides a class that filters profanities
"""
__author__ = "leoluk"
__version__ = '0.0.1'
import random
import re
class ProfanitiesFilter(object):
def __init__(self, filterlist, ignore_case=True, replacements="$@%-?!",
complete=True, inside_words=False):
"""
Inits the profanity filter.
filterlist -- a list of regular expressions that
matches words that are forbidden
ignore_case -- ignore capitalization
replacements -- string with characters to replace the forbidden word
complete -- completely remove the word or keep the first and last char?
inside_words -- search inside other words?
"""
self.badwords = filterlist
self.ignore_case = ignore_case
self.replacements = replacements
self.complete = complete
self.inside_words = inside_words
def _make_clean_word(self, length):
"""
Generates a random replacement string of a given length
using the chars in self.replacements.
"""
return ''.join([random.choice(self.replacements) for i in
range(length)])
def __replacer(self, match):
value = match.group()
if self.complete:
return self._make_clean_word(len(value))
else:
return value[0]+self._make_clean_word(len(value)-2)+value[-1]
def clean(self, text):
"""Cleans a string from profanity."""
regexp_insidewords = {
True: r'(%s)',
False: r'\b(%s)\b',
}
regexp = (regexp_insidewords[self.inside_words] %
'|'.join(self.badwords))
r = re.compile(regexp, re.IGNORECASE if self.ignore_case else 0)
return r.sub(self.__replacer, text)
if __name__ == '__main__':
f = ProfanitiesFilter(['bad', 'un\w+'], replacements="-")
example = "I am doing bad ungood badlike things."
print f.clean(example)
# Returns "I am doing --- ------ badlike things."
f.inside_words = True
print f.clean(example)
# Returns "I am doing --- ------ ---like things."
f.complete = False
print f.clean(example)
# Returns "I am doing b-d u----d b-dlike things."
Run Code Online (Sandbox Code Playgroud)
小智 20
arrBad = [
'2g1c',
'2 girls 1 cup',
'acrotomophilia',
'anal',
'anilingus',
'anus',
'arsehole',
'ass',
'asshole',
'assmunch',
'auto erotic',
'autoerotic',
'babeland',
'baby batter',
'ball gag',
'ball gravy',
'ball kicking',
'ball licking',
'ball sack',
'ball sucking',
'bangbros',
'bareback',
'barely legal',
'barenaked',
'bastardo',
'bastinado',
'bbw',
'bdsm',
'beaver cleaver',
'beaver lips',
'bestiality',
'bi curious',
'big black',
'big breasts',
'big knockers',
'big tits',
'bimbos',
'birdlock',
'bitch',
'black cock',
'blonde action',
'blonde on blonde action',
'blow j',
'blow your l',
'blue waffle',
'blumpkin',
'bollocks',
'bondage',
'boner',
'boob',
'boobs',
'booty call',
'brown showers',
'brunette action',
'bukkake',
'bulldyke',
'bullet vibe',
'bung hole',
'bunghole',
'busty',
'butt',
'buttcheeks',
'butthole',
'camel toe',
'camgirl',
'camslut',
'camwhore',
'carpet muncher',
'carpetmuncher',
'chocolate rosebuds',
'circlejerk',
'cleveland steamer',
'clit',
'clitoris',
'clover clamps',
'clusterfuck',
'cock',
'cocks',
'coprolagnia',
'coprophilia',
'cornhole',
'cum',
'cumming',
'cunnilingus',
'cunt',
'darkie',
'date rape',
'daterape',
'deep throat',
'deepthroat',
'dick',
'dildo',
'dirty pillows',
'dirty sanchez',
'dog style',
'doggie style',
'doggiestyle',
'doggy style',
'doggystyle',
'dolcett',
'domination',
'dominatrix',
'dommes',
'donkey punch',
'double dong',
'double penetration',
'dp action',
'eat my ass',
'ecchi',
'ejaculation',
'erotic',
'erotism',
'escort',
'ethical slut',
'eunuch',
'faggot',
'fecal',
'felch',
'fellatio',
'feltch',
'female squirting',
'femdom',
'figging',
'fingering',
'fisting',
'foot fetish',
'footjob',
'frotting',
'fuck',
'fucking',
'fuck buttons',
'fudge packer',
'fudgepacker',
'futanari',
'g-spot',
'gang bang',
'gay sex',
'genitals',
'giant cock',
'girl on',
'girl on top',
'girls gone wild',
'goatcx',
'goatse',
'gokkun',
'golden shower',
'goo girl',
'goodpoop',
'goregasm',
'grope',
'group sex',
'guro',
'hand job',
'handjob',
'hard core',
'hardcore',
'hentai',
'homoerotic',
'honkey',
'hooker',
'hot chick',
'how to kill',
'how to murder',
'huge fat',
'humping',
'incest',
'intercourse',
'jack off',
'jail bait',
'jailbait',
'jerk off',
'jigaboo',
'jiggaboo',
'jiggerboo',
'jizz',
'juggs',
'kike',
'kinbaku',
'kinkster',
'kinky',
'knobbing',
'leather restraint',
'leather straight jacket',
'lemon party',
'lolita',
'lovemaking',
'make me come',
'male squirting',
'masturbate',
'menage a trois',
'milf',
'missionary position',
'motherfucker',
'mound of venus',
'mr hands',
'muff diver',
'muffdiving',
'nambla',
'nawashi',
'negro',
'neonazi',
'nig nog',
'nigga',
'nigger',
'nimphomania',
'nipple',
'nipples',
'nsfw images',
'nude',
'nudity',
'nympho',
'nymphomania',
'octopussy',
'omorashi',
'one cup two girls',
'one guy one jar',
'orgasm',
'orgy',
'paedophile',
'panties',
'panty',
'pedobear',
'pedophile',
'pegging',
'penis',
'phone sex',
'piece of shit',
'piss pig',
'pissing',
'pisspig',
'playboy',
'pleasure chest',
'pole smoker',
'ponyplay',
'poof',
'poop chute',
'poopchute',
'porn',
'porno',
'pornography',
'prince albert piercing',
'pthc',
'pubes',
'pussy',
'queaf',
'raghead',
'raging boner',
'rape',
'raping',
'rapist',
'rectum',
'reverse cowgirl',
'rimjob',
'rimming',
'rosy palm',
'rosy palm and her 5 sisters',
'rusty trombone',
's&m',
'sadism',
'scat',
'schlong',
'scissoring',
'semen',
'sex',
'sexo',
'sexy',
'shaved beaver',
'shaved pussy',
'shemale',
'shibari',
'shit',
'shota',
'shrimping',
'slanteye',
'slut',
'smut',
'snatch',
'snowballing',
'sodomize',
'sodomy',
'spic',
'spooge',
'spread legs',
'strap on',
'strapon',
'strappado',
'strip club',
'style doggy',
'suck',
'sucks',
'suicide girls',
'sultry women',
'swastika',
'swinger',
'tainted love',
'taste my',
'tea bagging',
'threesome',
'throating',
'tied up',
'tight white',
'tit',
'tits',
'titties',
'titty',
'tongue in a',
'topless',
'tosser',
'towelhead',
'tranny',
'tribadism',
'tub girl',
'tubgirl',
'tushy',
'twat',
'twink',
'twinkie',
'two girls one cup',
'undressing',
'upskirt',
'urethra play',
'urophilia',
'vagina',
'venus mound',
'vibrator',
'violet blue',
'violet wand',
'vorarephilia',
'voyeur',
'vulva',
'wank',
'wet dream',
'wetback',
'white power',
'women rapping',
'wrapping men',
'wrinkled starfish',
'xx',
'xxx',
'yaoi',
'yellow showers',
'yiffy',
'zoophilia']
def profanityFilter(text):
brokenStr1 = text.split()
badWordMask = '!@#$%!@#$%^~!@%^~@#$%!@#$%^~!'
new = ''
for word in brokenStr1:
if word in arrBad:
print word + ' <--Bad word!'
text = text.replace(word,badWordMask[:len(word)])
#print new
return text
print profanityFilter("this thing sucks sucks sucks fucking stuff")
Run Code Online (Sandbox Code Playgroud)
您可以根据需要在坏词列表arrBad中添加或删除.
| 归档时间: |
|
| 查看次数: |
19800 次 |
| 最近记录: |