hw1

.py

School

University of Massachusetts, Amherst *

*We aren’t endorsed by this school

Course

325

Subject

Computer Science

Date

Apr 24, 2024

Type

py

Pages

5

Uploaded by LieutenantKnowledge1057 on coursehero.com

import matplotlib.pyplot as plt
import math
import os
import time
import operator
from collections import defaultdict

# Global class labels.
POS_LABEL = 'pos'
NEG_LABEL = 'neg'


###### DO NOT MODIFY THIS FUNCTION #####
def tokenize_doc(doc):
    """
    Tokenize a document and return its bag-of-words representation.
    doc - a string representing a document.
    returns a dictionary mapping each word to the number of times it appears in doc.
    """
    bow = defaultdict(float)
    tokens = doc.split()
    lowered_tokens = map(lambda t: t.lower(), tokens)
    for token in lowered_tokens:
        bow[token] += 1.0
    return dict(bow)
###### END FUNCTION #####


def n_word_types(word_counts):
    """
    Return the number of distinct word types in the corpus.

    word_counts - a dict mapping each word type to its token count.
    """
    # Each dictionary key is one word type.
    return len(word_counts)


def n_word_tokens(word_counts):
    """
    Return the total number of word tokens in the corpus.

    word_counts - a dict mapping each word type to its token count.
    """
    # The total token count is the sum of the per-type counts.
    return sum(word_counts.values())


class NaiveBayes:
    """A Naive Bayes model for text classification."""

    def __init__(self, path_to_data, tokenizer):
        # Vocabulary is a set that stores every word seen in the training data
        self.vocab = set()
        self.path_to_data = path_to_data
        self.tokenize_doc = tokenizer
        self.train_dir = os.path.join(path_to_data, "train")
        self.test_dir = os.path.join(path_to_data, "test")
        # class_total_doc_counts is a dictionary that maps a class (i.e., pos/neg) to
        # the number of documents in the training set of that class
        self.class_total_doc_counts = {POS_LABEL: 0.0,
                                       NEG_LABEL: 0.0}
        # class_total_word_counts is a dictionary that maps a class (i.e., pos/neg) to
        # the number of words in the training set in documents of that class
        self.class_total_word_counts = {POS_LABEL: 0.0,
                                        NEG_LABEL: 0.0}
        # class_word_counts is a dictionary of dictionaries. It maps a class (i.e.,
        # pos/neg) to a dictionary of word counts. For example:
        #   self.class_word_counts[POS_LABEL]['awesome']
        # stores the number of times the word 'awesome' appears in documents
        # of the positive class in the training documents.
        self.class_word_counts = {POS_LABEL: defaultdict(float),
                                  NEG_LABEL: defaultdict(float)}

    def train_model(self):
        """
        Process the entire training set under self.train_dir, updating the
        model's internal counts via tokenize_and_update_model, then print
        corpus statistics.
        """
        pos_path = os.path.join(self.train_dir, POS_LABEL)
        neg_path = os.path.join(self.train_dir, NEG_LABEL)
        for (p, label) in [(pos_path, POS_LABEL), (neg_path, NEG_LABEL)]:
            for f in os.listdir(p):
                with open(os.path.join(p, f), 'r') as doc:
                    content = doc.read()
                    self.tokenize_and_update_model(content, label)
        self.report_statistics_after_training()

    def report_statistics_after_training(self):
        """
        Report a number of statistics after training.
        """
        print("REPORTING CORPUS STATISTICS")
        print("NUMBER OF DOCUMENTS IN POSITIVE CLASS:", self.class_total_doc_counts[POS_LABEL])
        print("NUMBER OF DOCUMENTS IN NEGATIVE CLASS:", self.class_total_doc_counts[NEG_LABEL])
        print("NUMBER OF TOKENS IN POSITIVE CLASS:", self.class_total_word_counts[POS_LABEL])
        print("NUMBER OF TOKENS IN NEGATIVE CLASS:", self.class_total_word_counts[NEG_LABEL])
        print("VOCABULARY SIZE: NUMBER OF UNIQUE WORDTYPES IN TRAINING CORPUS:", len(self.vocab))

    def update_model(self, bow, label):
        """
        Update internal statistics given a document represented as a bag-of-words.

        bow - a map from words to their counts
        label - the class of the document whose bag-of-words representation was input

        This function doesn't return anything but updates:
        - self.class_word_counts: per-class, per-word occurrence counts
        - self.class_total_word_counts: number of tokens seen for each label
        - self.vocab: the vocabulary seen so far
        - self.class_total_doc_counts: number of documents seen of each label
        """
        self.class_total_doc_counts[label] += 1.0
        self.class_total_word_counts[label] += sum(bow.values())
        for word, count in bow.items():
            self.class_word_counts[label][word] += count
            self.vocab.add(word)

    def tokenize_and_update_model(self, doc, label):
        """
        Tokenize document doc (lowercasing all tokens) and update internal
        count statistics for class label.

        doc - a string representing a document.
        label - the sentiment of the document (either positive or negative)
        """
        # Use the tokenizer injected in __init__ (tokenize_doc) instead of
        # duplicating its split/lowercase logic inline; it produces the same
        # lowercased bag-of-words mapping.
        bow = self.tokenize_doc(doc)
        self.update_model(bow, label)

    def top_n(self, label, n):
        """
        Return the n most frequent (word, count) pairs for documents with
        class 'label', sorted by descending count.
        """
        word_counts = self.class_word_counts[label]
        return sorted(word_counts.items(), key=lambda x: x[1], reverse=True)[:n]

    def p_word_given_label(self, word, label):
        """
        Return the probability of word given label according to this NB model
        (maximum-likelihood estimate over the class's token counts).
        """
        word_count = self.class_word_counts[label][word]
        total_word_count = self.class_total_word_counts[label]
        # NOTE(review): the source is truncated here — the return statement is
        # missing from the visible text. Presumably it is
        # word_count / total_word_count; confirm against the complete file.
Your preview ends here
Eager to read the complete document? Join Bartleby Learn and gain access to the full version
  • Access to all documents
  • Unlimited textbook solutions
  • 24/7 expert homework help