Skip to content

Instantly share code, notes, and snippets.

@kse0202
Last active June 9, 2021 12:52
Show Gist options
  • Select an option

  • Save kse0202/9d3d8d519170064cefdd12fcb718afa0 to your computer and use it in GitHub Desktop.

Select an option

Save kse0202/9d3d8d519170064cefdd12fcb718afa0 to your computer and use it in GitHub Desktop.
python re패키지로 텍스트 전처리

python re패키지를 이용하여 한글 자음, 한글 모음, 특수문자, 이모티콘 삭제하기

import re


# Punctuation and symbols stripped from every row.  The character class is
# built with re.escape so no character needs hand-written escaping.
# NOTE(review): a literal backslash is NOT part of this set.  The earlier
# non-raw pattern contained "\\" directly before "‘", which the regex engine
# read as an escaped "‘", so backslashes were never removed.  That behaviour
# is preserved here; add "\\" to the string below if backslashes should go.
_SPECIAL_CHARS_RE = re.compile(
    "[" + re.escape("-=+,#/?:^$.@*\"※~&%ㆍ!』‘|()[]<>`'…《》") + "]"
)

# Stand-alone Hangul compatibility jamo (bare consonants / vowels such as
# "ㅋㅋㅋ" or "ㅠㅠ"); complete syllables are outside these ranges.
_HANGUL_JAMO_RE = re.compile("[ㄱ-ㅎㅏ-ㅣ]+")

# Symbol-style emoji that live in the BMP, plus ";" and curly double quotes.
# U+FE0F is the emoji variation selector that accompanies glyphs such as the
# red heart; it is written as an escape so it stays visible in the source.
_SYMBOL_EMOJI_RE = re.compile("[♡❤✌❣♥ᆢ✊✨⤵☺;”“\uFE0F]+")

# Every code point outside the Basic Multilingual Plane.  This already covers
# the emoticon, pictograph, transport and flag blocks (U+1F1E0-U+1F6FF), so
# no separate emoji pass is needed afterwards.
_NON_BMP_RE = re.compile("[\U00010000-\U0010FFFF]+")


def get_clean_text(df, column="column_nm"):
    """Strip special characters, bare jamo and emoji, then join the rows.

    Args:
        df: Table-like object whose ``df[column]`` is iterable and yields one
            cell value per row (e.g. a pandas DataFrame).
        column: Name of the column holding the text to clean.  Defaults to
            ``"column_nm"``.

    Returns:
        The cleaned text of all non-missing rows joined by a single space.
        An empty string is returned when there are no rows to join.

    Raises:
        TypeError: If a non-missing cell is not a string.
    """
    cleaned_rows = []

    # Iterate over the values themselves rather than df[column][i]: label
    # lookups break as soon as the index is not the default 0..n-1 range.
    for value in df[column]:
        # Missing cells (None, or NaN which stringifies to "nan") contribute
        # nothing to the output.  A cell holding the literal text "nan" is
        # skipped as well, matching the previous behaviour.
        if value is None or str(value) == "nan":
            continue

        row = _SPECIAL_CHARS_RE.sub("", value)
        row = _HANGUL_JAMO_RE.sub("", row)
        row = _SYMBOL_EMOJI_RE.sub("", row)
        row = _NON_BMP_RE.sub("", row)
        cleaned_rows.append(row)

    # Join once, after the loop, so empty input yields "" instead of failing.
    return " ".join(cleaned_rows)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment