```python
from sklearn.naive_bayes import MultinomialNB  # assuming a multinomial Naive Bayes model
from sklearn.metrics import accuracy_score

naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)
predictions = naive_bayes.predict(testing_data)
print('accuracy: {}'.format(accuracy_score(y_test, predictions)))
# accuracy: 0.7391304347826086

# using cross-validation: rebuild the feature matrix on the whole dataset
X_whole = count_vectorizer.fit_transform(covid_data['prep_arg'])
y_whole = covid_data['concern']
```
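The snippet above builds `X_whole` and `y_whole` but stops before the scoring step. A minimal sketch of the cross-validation it sets up might look like the following; the four toy documents and labels here are placeholders standing in for the `covid_vacc_concerns.csv` data, and `MultinomialNB` is assumed as the classifier:

```python
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB

# Placeholder corpus standing in for covid_data['prep_arg'] / covid_data['concern']
docs = ["vaccine is safe", "worried about side effects",
        "trials were rushed", "safe and effective shot"]
labels = [0, 1, 1, 0]

count_vectorizer = CountVectorizer(binary=True)
X_whole = count_vectorizer.fit_transform(docs)

# 2-fold cross-validated accuracy of the same kind of Naive Bayes model
scores = cross_val_score(MultinomialNB(), X_whole, labels, cv=2, scoring='accuracy')
print(scores.mean())
```

Cross-validation averages accuracy over several train/test splits, so it gives a less split-dependent estimate than the single 80/20 split used above.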
```python
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(binary=True)
# fit the vectorizer on the training data and transform it
training_data = count_vectorizer.fit_transform(X_train)
# transform the test data with the vocabulary learned from the training data
testing_data = count_vectorizer.transform(X_test)
```
```python
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    covid_data['prep_arg'],
    covid_data['concern'],
    test_size=0.2,
    random_state=50
)
```
```python
import pandas as pd

covid_data = pd.read_csv('covid_vacc_concerns.csv')
# preprocess is the text-cleaning function built from the steps shown below
covid_data['prep_arg'] = covid_data['arg'].apply(preprocess)
```
| vocabulary | apples | are | great | but | so | pears | however | sometimes | I | feel | like | oranges | and | on | other | days | bananas |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| index | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
| Sentence 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| Sentence 2 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
| Sentence 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
```python
from nltk.stem import WordNetLemmatizer, PorterStemmer

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

word = "considering"
stemmed_word = stemmer.stem(word)             # 'consid'
lemmatised_word = lemmatizer.lemmatize(word)  # 'considering' (noun is the default POS)
```
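The contrast between the two is easier to see across related word forms. The sketch below uses only the Porter stemmer (which, unlike the WordNet lemmatizer, needs no corpus downloads); the word list is arbitrary:

```python
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
# Related forms collapse to the same stem, which need not be a dictionary word
for w in ["considering", "considers", "apples", "apple"]:
    print(w, "->", stemmer.stem(w))
```

Stemming chops suffixes by rule, so related forms map to one (possibly non-word) stem, while lemmatization maps each form to a real dictionary headword.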
```python
import re

raw_text = "this is a test. To demonstrate 2 regex expressions!!"
# keep letters only
letters_only_text = re.sub(r"[^a-zA-Z]", " ", raw_text)
# keep letters, digits and whitespace
letnum_text = re.sub(r"[^a-zA-Z0-9\s]+", " ", raw_text)
```
```python
from nltk.tokenize import word_tokenize
from nltk.util import bigrams, trigrams

sen = "Dummy sentence to demonstrate bigrams"
# using NLTK's word_tokenize rather than split(), because split() does not account for punctuation
nltk_tokens = word_tokenize(sen)
# splitting the sentence into bigrams and trigrams
print(list(bigrams(nltk_tokens)))
print(list(trigrams(nltk_tokens)))
# next: create a dictionary that shows occurrences of n-grams in the text
```
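The final comment mentions a dictionary of n-gram occurrences but the snippet stops there. One way to sketch that step is with `collections.Counter`; this assumes a plain frequency map is what was intended, and it tokenizes with `split()` so the sketch runs without NLTK data downloads (the snippet above rightly prefers `word_tokenize` for punctuation handling):

```python
from collections import Counter
from nltk.util import bigrams

# Toy text with a repeated bigram so the counts are visible
tokens = "the cat sat on the mat and the cat slept".split()
bigram_counts = Counter(bigrams(tokens))
print(bigram_counts[("the", "cat")])  # -> 2
```

`Counter` maps each bigram tuple to how many times it occurs, which is exactly the occurrence dictionary the comment describes.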
```python
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

example_sent = "This is a sample sentence, showing off the stop words filtration."
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(example_sent)
# lowercase each token before the lookup, since the NLTK stop word list is lowercase
filtered_sentence = [w for w in word_tokens if w.lower() not in stop_words]
print(filtered_sentence)
```