Unstructured-Classification Hands-On Solutions -- Fresco Play
Keep in mind the indentation of loops and functions; if the program does not run, it may be due to incorrect indentation.
Source Code
The course id of Unstructured-Classification is 55943.
Install --> Test --> Run --> Open Preview
Copy the URL and paste it into the next tab
Click on unstructured_test.ipynb
Step 1:-
import pandas as pd
import numpy as np
import csv
Step 2:-
#Data Loading
imdb=pd.read_csv("imdb.csv")
imdb.columns = ["index","text","label"]
print(imdb.head(5))
Step 3:-
data_size = imdb.shape
print(data_size)
imdb_col_names = list(imdb.columns)
print(imdb_col_names)
print(imdb.groupby('label').describe())
print(imdb.head(3))
Step 4:-
imdb_target=imdb['label']
print(imdb_target)
Step 5:-
from nltk.tokenize import word_tokenize
import nltk
nltk.download('all')
def split_tokens(text):
    text = text.lower()
    word_tokens = word_tokenize(text)
    return word_tokens
imdb['tokenized_message'] = imdb.apply(lambda row: split_tokens(row['text']), axis = 1)
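As an optional sanity check (not part of the graded cells), you can call split_tokens on a made-up sentence to see what word_tokenize produces:
sample = "This movie was surprisingly good!"
print(split_tokens(sample))
# roughly: ['this', 'movie', 'was', 'surprisingly', 'good', '!']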
Step 6:-
from nltk.stem.wordnet import WordNetLemmatizer
def split_into_lemmas(text):
    lemma = []
    lemmatizer = WordNetLemmatizer()
    for word in text:
        a = lemmatizer.lemmatize(word)
        lemma.append(a)
    return lemma
imdb['lemmatized_message'] = imdb.apply(lambda row: split_into_lemmas(row['tokenized_message']),axis=1)
print('Tokenized message:', imdb['tokenized_message'][55])
print('Lemmatized message:', imdb['lemmatized_message'][55])
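For reference, WordNetLemmatizer maps inflected forms to their dictionary forms and treats every word as a noun unless you pass a pos tag; a small illustration, assuming the wordnet data downloaded above:
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('movies'))            # -> 'movie'
print(lemmatizer.lemmatize('running', pos='v'))  # -> 'run' (verbs need pos='v')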
Step 7:-
from nltk.corpus import stopwords
def stopword_removal(text):
    stop_words = set(stopwords.words('english'))
    filtered_sentence = ' '.join([word for word in text if word not in stop_words])
    return filtered_sentence
imdb['preprocessed_message'] = imdb.apply(lambda row: stopword_removal(row['lemmatized_message']),axis = 1)
print('Preprocessed message:',imdb['preprocessed_message'])
Training_data=pd.Series(list(imdb['preprocessed_message']))
Training_label=pd.Series(list(imdb['label']))
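If you want to see what stopword_removal does to a single token list (an optional check with made-up input):
print(stopword_removal(['this', 'movie', 'is', 'not', 'good']))
# -> 'movie good'  ('this', 'is' and 'not' are all in NLTK's English stop list)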
Step 8:-
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
tf_vectorizer = CountVectorizer(ngram_range = (1,2), min_df = (1/len(Training_label)),max_df = 0.7)
Total_Dictionary_TDM = tf_vectorizer.fit(Training_data)
message_data_TDM = Total_Dictionary_TDM.transform(Training_data)
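message_data_TDM is a sparse term-document matrix with one row per review and one column per unigram/bigram in the vocabulary; an optional way to inspect its size:
print(message_data_TDM.shape)  # (number of reviews, vocabulary size)
print(len(tf_vectorizer.get_feature_names_out()))  # vocabulary size again (use get_feature_names() on older scikit-learn versions)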
Step 9:-
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(ngram_range = (1,2), min_df = (1/len(Training_label)),max_df = 0.7)
Total_Dictionary_TFIDF = tfidf_vectorizer.fit(Training_data)
message_data_TFIDF = Total_Dictionary_TFIDF.transform(Training_data)
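The TF-IDF matrix has the same shape, but its entries are TF-IDF weights instead of raw counts; a quick comparison for the first review (illustrative only):
print(message_data_TDM[0].toarray().max())    # largest raw term count in the first review
print(message_data_TFIDF[0].toarray().max())  # corresponding largest tf-idf weight (a float, at most 1 with the default l2 normalisation)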
Step 10:-
from sklearn.model_selection import train_test_split  # splitting the data for training and testing
train_data,test_data, train_label, test_label = train_test_split(message_data_TDM,Training_label,test_size = 0.1)
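Note that train_test_split shuffles randomly, so the scores below will vary between runs; if you want reproducible numbers you can pass a fixed random_state (optional, shown here with the seed used later):
train_data, test_data, train_label, test_label = train_test_split(
    message_data_TDM, Training_label, test_size=0.1, random_state=9)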
Step 11:-
seed=9
from sklearn.svm import SVC
train_data_shape = train_data.shape
test_data_shape = test_data.shape
print("The shape of train data", train_data_shape)
print("The shape of test data", test_data_shape )
classifier = SVC(kernel="linear",C=0.025,random_state=seed)
classifier = classifier.fit(train_data,train_label)
#target =
score = classifier.score(test_data, test_label)
print('SVM Classifier : ', score)
with open('output.txt', 'w') as file:
    file.write(str((imdb['tokenized_message'][55], imdb['lemmatized_message'][55])))
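To see where the linear SVM makes mistakes on the held-out data, an optional confusion matrix (rows are true labels, columns are predictions):
from sklearn.metrics import confusion_matrix
predictions = classifier.predict(test_data)
print(confusion_matrix(test_label, predictions))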
Step 12:-
from sklearn.linear_model import SGDClassifier
train_data,test_data, train_label, test_label = train_test_split( message_data_TDM, Training_label, test_size = 0.2)
train_data_shape = train_data.shape
test_data_shape = test_data.shape
print("The shape of train data", train_data_shape )
print("The shape of test data", test_data_shape )
classifier = SGDClassifier( loss='modified_huber',shuffle = True, random_state = seed )
classifier = classifier.fit(train_data,train_label)
#target=
score = classifier.score(test_data,test_label)
print('SGD classifier : ',score)
with open('output1.txt', 'w') as file:
    file.write(str(imdb['preprocessed_message'][55]))
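To classify a brand-new review with the fitted SGD model, it must go through the same preprocessing and the same fitted vectorizer; a sketch with a made-up review:
new_review = "A dull plot and wooden acting"
tokens = split_into_lemmas(split_tokens(new_review))
features = Total_Dictionary_TDM.transform([stopword_removal(tokens)])
print(classifier.predict(features))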
If you have any queries, please feel free to ask in the comments section.
