techykajal/create_dictionary.py

## create_dictionary.py
# define the function to create dictionary and document to term matrix
def create_dic_and_docterm_matrix(Complete_Content, dict_file_path, matrix_file_path):
    """
    Create the corpus dictionary and the document to term matrix, saving both.

    Argument:
        Complete_Content: tokenized text corpus (an iterable of documents,
            each document being an iterable of tokens)
        dict_file_path: file path to save dictionary
        matrix_file_path: file path to save matrix
    returns:
        (id2word_dic, doc_term_matrix): the corpus dictionary and the
        document to term matrix, a list holding one `doc2bow` result per
        document, in corpus order

    NOTE: `corpora` is not defined in this block; it is expected to be
    `gensim.corpora`, imported elsewhere in the file.
    """
    # The corpus is walked twice below: once to build the dictionary and once
    # to build the matrix. A one-shot iterator (e.g. a generator) would be
    # exhausted by the first pass and silently produce an empty matrix, so
    # materialise it up front. Re-iterable inputs (lists, streaming corpus
    # objects) are left untouched.
    if iter(Complete_Content) is Complete_Content:
        Complete_Content = list(Complete_Content)

    # Create Dictionary and save it
    id2word_dic = corpora.Dictionary(Complete_Content)
    id2word_dic.save(dict_file_path)

    # TODO: open question from the original author - should the corpus used
    # here stay the tokenized one, or be replaced by the one obtained after
    # POS tagging?
    # Document to term frequency: one bag-of-words per document
    doc_term_matrix = [id2word_dic.doc2bow(tokens) for tokens in Complete_Content]
    # Save Doc-Term matrix
    corpora.MmCorpus.serialize(matrix_file_path, doc_term_matrix)

    return id2word_dic, doc_term_matrix


# Output locations for the serialized doc-term matrix and the saved dictionary.
matrix_file_path = r"C:\Users\Kajal\Desktop\BlueThinQ_Project\Topic Modelling\doc_term_matrix.txt"
dict_file_path = r"C:\Users\Kajal\Desktop\BlueThinQ_Project\Topic Modelling\dictionary.txt"

# Build both artefacts from the tokenized corpus; they are written to the
# paths above and also kept in memory for the later modelling steps.
dic_LDA, doc_term_matrix = create_dic_and_docterm_matrix(
    Complete_Content,
    dict_file_path,
    matrix_file_path,
)
	# define the function to create dictionary and document to term matrix
	def create_dic_and_docterm_matrix(Complete_Content, dict_file_path, matrix_file_path):
		"""
		Create the corpus dictionary and the document to term matrix, saving both.

		Argument:
			Complete_Content: tokenized text corpus (an iterable of documents,
				each document being an iterable of tokens)
			dict_file_path: file path to save dictionary
			matrix_file_path: file path to save matrix
		returns:
			(id2word_dic, doc_term_matrix): the corpus dictionary and the
			document to term matrix, a list holding one `doc2bow` result per
			document, in corpus order

		NOTE: `corpora` is not defined in this block; it is expected to be
		`gensim.corpora`, imported elsewhere in the file.
		"""
		# The corpus is walked twice below: once to build the dictionary and
		# once to build the matrix. A one-shot iterator (e.g. a generator)
		# would be exhausted by the first pass and silently produce an empty
		# matrix, so materialise it up front. Re-iterable inputs (lists,
		# streaming corpus objects) are left untouched.
		if iter(Complete_Content) is Complete_Content:
			Complete_Content = list(Complete_Content)

		# Create Dictionary and save it
		id2word_dic = corpora.Dictionary(Complete_Content)
		id2word_dic.save(dict_file_path)

		# TODO: open question from the original author - should the corpus
		# used here stay the tokenized one, or be replaced by the one obtained
		# after POS tagging?
		# Document to term frequency: one bag-of-words per document
		doc_term_matrix = [id2word_dic.doc2bow(tokens) for tokens in Complete_Content]
		# Save Doc-Term matrix
		corpora.MmCorpus.serialize(matrix_file_path, doc_term_matrix)

		return id2word_dic, doc_term_matrix


	# Output locations for the serialized doc-term matrix and the saved dictionary.
	matrix_file_path = r"C:\Users\Kajal\Desktop\BlueThinQ_Project\Topic Modelling\doc_term_matrix.txt"
	dict_file_path = r"C:\Users\Kajal\Desktop\BlueThinQ_Project\Topic Modelling\dictionary.txt"

	# Build both artefacts from the tokenized corpus; they are written to the
	# paths above and also kept in memory for the later modelling steps.
	dic_LDA, doc_term_matrix = create_dic_and_docterm_matrix(
		Complete_Content,
		dict_file_path,
		matrix_file_path,
	)
No results found