hw1

.py

School

University of Massachusetts, Amherst *

*We aren’t endorsed by this school

Course

325

Subject

Computer Science

Date

Apr 24, 2024

Type

py

Pages

5

Uploaded by LieutenantKnowledge1057 on coursehero.com

import matplotlib.pyplot as plt
import math
import os
import time
import operator
from collections import defaultdict

# Global class labels.
POS_LABEL = 'pos'
NEG_LABEL = 'neg'


###### DO NOT MODIFY THIS FUNCTION #####
def tokenize_doc(doc):
    """
    Tokenize a document and return its bag-of-words representation.
    doc - a string representing a document.
    returns a dictionary mapping each word to the number of times it appears in doc.
    """
    bow = defaultdict(float)
    tokens = doc.split()
    lowered_tokens = map(lambda t: t.lower(), tokens)
    for token in lowered_tokens:
        bow[token] += 1.0
    return dict(bow)
###### END FUNCTION #####


def n_word_types(word_counts):
    """
    Return the number of distinct word types in the corpus.

    word_counts - a dict mapping each word type to its token count.
    """
    # Each dictionary key is one word type.
    return len(word_counts)


def n_word_tokens(word_counts):
    """
    Return the total number of word tokens in the corpus.

    word_counts - a dict mapping each word type to its token count.
    """
    # The total token count is the sum of the per-type counts.
    return sum(word_counts.values())


class NaiveBayes:
    """A Naive Bayes model for text classification."""

    def __init__(self, path_to_data, tokenizer):
        # Vocabulary is a set that stores every word seen in the training data
        self.vocab = set()
        self.path_to_data = path_to_data
        self.tokenize_doc = tokenizer
        self.train_dir = os.path.join(path_to_data, "train")
        self.test_dir = os.path.join(path_to_data, "test")
        # class_total_doc_counts is a dictionary that maps a class (i.e., pos/neg) to
        # the number of documents in the training set of that class
        self.class_total_doc_counts = {POS_LABEL: 0.0,
                                       NEG_LABEL: 0.0}
        # class_total_word_counts is a dictionary that maps a class (i.e., pos/neg) to
        # the number of words in the training set in documents of that class
        self.class_total_word_counts = {POS_LABEL: 0.0,
                                        NEG_LABEL: 0.0}
        # class_word_counts is a dictionary of dictionaries. It maps a class (i.e.,
        # pos/neg) to a dictionary of word counts. For example:
        #   self.class_word_counts[POS_LABEL]['awesome']
        # stores the number of times the word 'awesome' appears in documents
        # of the positive class in the training documents.
        self.class_word_counts = {POS_LABEL: defaultdict(float),
                                  NEG_LABEL: defaultdict(float)}

    def train_model(self):
        """
        Process the entire training set under self.train_dir, updating the
        model's internal counts via tokenize_and_update_model, then print
        corpus statistics.
        """
        pos_path = os.path.join(self.train_dir, POS_LABEL)
        neg_path = os.path.join(self.train_dir, NEG_LABEL)
        for (p, label) in [(pos_path, POS_LABEL), (neg_path, NEG_LABEL)]:
            for f in os.listdir(p):
                with open(os.path.join(p, f), 'r') as doc:
                    content = doc.read()
                    self.tokenize_and_update_model(content, label)
        self.report_statistics_after_training()

    def report_statistics_after_training(self):
        """
        Report a number of statistics after training.
        """
        print("REPORTING CORPUS STATISTICS")
        print("NUMBER OF DOCUMENTS IN POSITIVE CLASS:", self.class_total_doc_counts[POS_LABEL])
        print("NUMBER OF DOCUMENTS IN NEGATIVE CLASS:", self.class_total_doc_counts[NEG_LABEL])
        print("NUMBER OF TOKENS IN POSITIVE CLASS:", self.class_total_word_counts[POS_LABEL])
        print("NUMBER OF TOKENS IN NEGATIVE CLASS:", self.class_total_word_counts[NEG_LABEL])
        print("VOCABULARY SIZE: NUMBER OF UNIQUE WORDTYPES IN TRAINING CORPUS:", len(self.vocab))

    def update_model(self, bow, label):
        """
        Update internal statistics given a document represented as a bag-of-words.

        bow - a map from words to their counts
        label - the class of the document whose bag-of-words representation was input

        This function doesn't return anything but updates:
        - self.class_word_counts: per-class, per-word occurrence counts
        - self.class_total_word_counts: number of tokens seen for each label
        - self.vocab: the vocabulary seen so far
        - self.class_total_doc_counts: number of documents seen of each label
        """
        self.class_total_doc_counts[label] += 1.0
        self.class_total_word_counts[label] += sum(bow.values())
        for word, count in bow.items():
            self.class_word_counts[label][word] += count
            self.vocab.add(word)

    def tokenize_and_update_model(self, doc, label):
        """
        Tokenize document doc (lowercasing all tokens) and update internal
        count statistics for class label.

        doc - a string representing a document.
        label - the sentiment of the document (either positive or negative)
        """
        # Use the tokenizer injected in __init__ (tokenize_doc) instead of
        # duplicating its split/lowercase logic inline; it produces the same
        # lowercased bag-of-words mapping.
        bow = self.tokenize_doc(doc)
        self.update_model(bow, label)

    def top_n(self, label, n):
        """
        Return the n most frequent (word, count) pairs for documents with
        class 'label', sorted by descending count.
        """
        word_counts = self.class_word_counts[label]
        return sorted(word_counts.items(), key=lambda x: x[1], reverse=True)[:n]

    def p_word_given_label(self, word, label):
        """
        Return the probability of word given label according to this NB model
        (maximum-likelihood estimate over the class's token counts).
        """
        word_count = self.class_word_counts[label][word]
        total_word_count = self.class_total_word_counts[label]
        # NOTE(review): the source is truncated here — the return statement is
        # missing from the visible text. Presumably it is
        # word_count / total_word_count; confirm against the complete file.
Your preview ends here
Eager to read the complete document? Join Bartleby Learn and gain access to the full version
  • Access to all documents
  • Unlimited textbook solutions
  • 24/7 expert homework help