# coding: utf-8

# Python script used to preprocess the Facebook comments

# In[43]:
""" This python file reads the a given CSV file of data, preprocesses it and stores the processed comments into a CSV file ready to be processed by R for emotion classification"""
import os, time, csv

import pandas as pd

# Read the CSV file of Facebook data
facebook = pd.read_csv("C:/Users/niraj/Downloads/CW/data_lufthansa.csv")
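# Optional sanity check: confirm the frame loaded and that the comment column
# used below is present in this export.
print(facebook.shape)
print('object_link.connections.comments.message' in facebook.columns)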
# In[45]:

# In[46]:
# Set up the Google Translate API
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "C:/Users/niraj/Downloads/MyCodingWeek-3bd33d179218.json"

from google.cloud import translate

client = translate.Client()
# Translate every comment automatically to English
comments = facebook['object_link.connections.comments.message']
datatrans = {}
for i in range(len(comments)):
    # The try-except guards against inconsistencies in the data
    try:
        a = client.translate(comments[i])
        datatrans[i] = a['translatedText']
    except:
        pass
    print((i / len(comments)) * 100)  # Keep track of the translation progress
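# For reference, the v2 translate client returns a dict per request with the
# keys 'translatedText', 'detectedSourceLanguage' and 'input'; a minimal
# illustration on a made-up string (assumes the client above is authenticated):
sample = client.translate("Guten Flug!")
print(sample['detectedSourceLanguage'], sample['translatedText'])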
# In[47]:
import re
from nltk.corpus import stopwords

data = datatrans
# Remove all punctuation
for i in range(len(data)):
    try:
        data[i] = re.sub(r'[^\w\s]', '', data[i])
    except:
        pass
# Remove all numbers
for i in range(len(data)):
    try:
        data[i] = re.sub(r" \d+", " ", data[i])
    except:
        pass
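# Worked example of the two substitutions above on a made-up comment; note that
# the number pattern requires a leading space, so digits glued to a word
# (e.g. 'LH400') would survive:
example = 'Great flight!!! LH 400'
example = re.sub(r'[^\w\s]', '', example)  # drops '!!!' -> 'Great flight LH 400'
example = re.sub(r' \d+', ' ', example)    # drops ' 400' -> 'Great flight LH '
print(example)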
# Remove all stopwords
stopword = set(stopwords.words('english'))
for i in range(len(data)):
    try:
        querywords = data[i].split()
        resultwords = [word for word in querywords if word.lower() not in stopword]
        data[i] = ' '.join(resultwords)
    except:
        pass
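# Illustration of the stopword filter on a made-up comment:
demo = 'this is a great flight'
print(' '.join(w for w in demo.split() if w.lower() not in stopword))  # -> 'great flight'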
# Remove URLs (match any http/https link up to the next whitespace; the
# original pattern only matched a link at the start of a line followed by a
# newline, which almost never fired)
for i in range(len(data)):
    try:
        data[i] = re.sub(r'https?://\S+', '', data[i])
    except:
        pass
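# Illustration of the URL pattern on a made-up comment (the link and everything
# up to the next whitespace is dropped):
print(re.sub(r'https?://\S+', '', 'see https://example.com for details'))  # -> 'see  for details'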
# Lemmatisation
from textblob import Word

for i in range(len(data)):
    try:
        resultwords = [Word(y).lemmatize() for y in data[i].split()]
        data[i] = ' '.join(resultwords)
    except:
        pass
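# Illustration: lemmatisation maps inflected forms back to dictionary words
# (TextBlob lemmatises as a noun by default):
print(Word('flights').lemmatize())  # -> 'flight'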
# Stemming
from nltk.stem import PorterStemmer

st = PorterStemmer()
for i in range(len(data)):
    try:
        resultwords = [st.stem(y) for y in data[i].split()]
        data[i] = ' '.join(resultwords)
    except:
        pass
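# Illustration: stemming is cruder than lemmatisation and can yield non-words:
print(st.stem('delayed'))   # -> 'delay'
print(st.stem('services'))  # -> 'servic'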
# In[48]:
# Write the processed comments to a CSV file so we can process them in R
# (opening the file with encoding='utf-8' avoids writing b'...' byte literals)
with open('facebook_comments_processed_LHR.csv', 'w', encoding='utf-8', newline='') as fout:
    writer = csv.writer(fout)
    for i in range(len(data)):
        try:
            writer.writerow([data[i]])
        except:
            pass
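# Optional read-back check on the file just written (one processed comment per
# row, no header):
check = pd.read_csv('facebook_comments_processed_LHR.csv', header=None)
print(check.head())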