# coding: utf-8

# Python script used to preprocess the Facebook comments

# In[43]:
""" This python file reads the a given CSV file of data, preprocesses it and stores the processed comments into a CSV file ready to be processed by R for emotion classification"""
import os, time, csv

import pandas as pd

# Read the CSV file of Facebook data
facebook = pd.read_csv("C:/Users/niraj/Downloads/CW/data_lufthansa.csv")
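# Optional sanity check: confirm the frame loaded and that the comment column
# used below is present in this export.
print(facebook.shape)
print('object_link.connections.comments.message' in facebook.columns)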
# In[45]:

# In[46]:
# Set up the Google Translate API
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "C:/Users/niraj/Downloads/MyCodingWeek-3bd33d179218.json"

from google.cloud import translate

client = translate.Client()
# Translate every comment automatically to English
comments = facebook['object_link.connections.comments.message']
datatrans = {}
for i in range(len(comments)):
    # The try-except guards against inconsistencies in the data
    try:
        a = client.translate(comments[i])
        datatrans[i] = a['translatedText']
    except:
        pass
    print((i / len(comments)) * 100)  # Keep track of the translation progress
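# For reference, the v2 translate client returns a dict per request with the
# keys 'translatedText', 'detectedSourceLanguage' and 'input'; a minimal
# illustration on a made-up string (assumes the client above is authenticated):
sample = client.translate("Guten Flug!")
print(sample['detectedSourceLanguage'], sample['translatedText'])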
# In[47]:
import re
from nltk.corpus import stopwords

data = datatrans
# Remove all punctuation
for i in range(len(data)):
    try:
        data[i] = re.sub(r'[^\w\s]', '', data[i])
    except:
        pass
# Remove all numbers
for i in range(len(data)):
    try:
        data[i] = re.sub(r" \d+", " ", data[i])
    except:
        pass
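# Worked example of the two substitutions above on a made-up comment; note that
# the number pattern requires a leading space, so digits glued to a word
# (e.g. 'LH400') would survive:
example = 'Great flight!!! LH 400'
example = re.sub(r'[^\w\s]', '', example)  # drops '!!!' -> 'Great flight LH 400'
example = re.sub(r' \d+', ' ', example)    # drops ' 400' -> 'Great flight LH '
print(example)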
# Remove all stopwords
stopword = set(stopwords.words('english'))
for i in range(len(data)):
    try:
        querywords = data[i].split()
        resultwords = [word for word in querywords if word.lower() not in stopword]
        data[i] = ' '.join(resultwords)
    except:
        pass
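# Illustration of the stopword filter on a made-up comment:
demo = 'this is a great flight'
print(' '.join(w for w in demo.split() if w.lower() not in stopword))  # -> 'great flight'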
# Remove URLs (match any http/https link up to the next whitespace; the
# original pattern only matched a link at the start of a line followed by a
# newline, which almost never fired)
for i in range(len(data)):
    try:
        data[i] = re.sub(r'https?://\S+', '', data[i])
    except:
        pass
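# Illustration of the URL pattern on a made-up comment (the link and everything
# up to the next whitespace is dropped):
print(re.sub(r'https?://\S+', '', 'see https://example.com for details'))  # -> 'see  for details'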
# Lemmatisation
from textblob import Word

for i in range(len(data)):
    try:
        resultwords = [Word(y).lemmatize() for y in data[i].split()]
        data[i] = ' '.join(resultwords)
    except:
        pass
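# Illustration: lemmatisation maps inflected forms back to dictionary words
# (TextBlob lemmatises as a noun by default):
print(Word('flights').lemmatize())  # -> 'flight'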
# Stemming
from nltk.stem import PorterStemmer

st = PorterStemmer()
for i in range(len(data)):
    try:
        resultwords = [st.stem(y) for y in data[i].split()]
        data[i] = ' '.join(resultwords)
    except:
        pass
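# Illustration: stemming is cruder than lemmatisation and can yield non-words:
print(st.stem('delayed'))   # -> 'delay'
print(st.stem('services'))  # -> 'servic'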
# In[48]:
# Write the processed comments to a CSV file so we can process them in R
# (opening the file with encoding='utf-8' avoids writing b'...' byte literals)
with open('facebook_comments_processed_LHR.csv', 'w', encoding='utf-8', newline='') as fout:
    writer = csv.writer(fout)
    for i in range(len(data)):
        try:
            writer.writerow([data[i]])
        except:
            pass
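# Optional read-back check on the file just written (one processed comment per
# row, no header):
check = pd.read_csv('facebook_comments_processed_LHR.csv', header=None)
print(check.head())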