-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodel.py
More file actions
105 lines (80 loc) · 2.92 KB
/
model.py
File metadata and controls
105 lines (80 loc) · 2.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import numpy as np
from hmmlearn import hmm
import pickle
import re
#Returns saved model from file
def loadModel():
    """Load and return the pickled HMM network from hmm.pkl."""
    with open("hmm.pkl", "rb") as saved:
        return pickle.load(saved)
#Return language model
def loadLanguage():
    """Load the pickled language model.

    Returns a tuple (uniqueWords, wordDictionary) read from
    languageModel.pkl and wordDictionary.pkl respectively.
    """
    with open("languageModel.pkl", "rb") as fh:
        vocab = pickle.load(fh)
    with open("wordDictionary.pkl", "rb") as fh:
        lookup = pickle.load(fh)
    return vocab, lookup
#Defines network architecture and returns untrained model
def createModel():
    """Build and return an untrained 50-state Gaussian HMM.

    Uses a full covariance matrix per state and caps EM training at
    5 iterations (presumably to keep training time down — confirm
    convergence is acceptable at n_iter=5).
    """
    network = hmm.GaussianHMM(n_components=50, covariance_type="full", n_iter=5)
    # NOTE: the original body contained a bare `network.monitor_` attribute
    # access — a no-op statement with no effect — which has been removed.
    return network
#Trains model based on data in text file, may be database later
def trainModel(network, languageModel, wordDictionary):
    """Fit `network` on word-index sequences built from textData.txt.

    Each line of the file is tokenized exactly like in
    buildLanguageModelFromText (the two must stay in sync so every token
    has an entry in `wordDictionary`), mapped to integer indices, and the
    concatenated sequences are passed to hmmlearn's fit(). The trained
    network is pickled to hmm.pkl and returned.

    Raises KeyError if a token is missing from `wordDictionary`.
    """
    trainingSequences = []
    lengths = []
    # with-block closes the file handle (the original leaked it).
    with open("textData.txt", "rb") as textData:
        for i in textData.readlines():
            # str() of a bytes line yields "b'...'" — the leading "b'..."
            # token is discarded by the [1:] slice below. Kept as-is so the
            # tokens match the vocabulary built by buildLanguageModelFromText.
            words = str(i.lower())
            # Raw strings avoid invalid-escape warnings; patterns unchanged.
            words = re.sub(r"[:;\"]", "", words)
            words = re.sub(r"[\(\)]", " ", words)
            words = re.sub(r"([\.!?,])", r" \g<0>", words)
            words = words.split(" ")[1:]
            sequence = [wordDictionary[j] for j in words]
            lengths.append(len(words))
            trainingSequences.append(np.array(sequence))
    print("Finished creating training sequence data")
    #TEMPORARY FOR RESTRICTING AMOUNT OF DATA TO TRAIN ON
    trainingSequences = trainingSequences[:100]
    lengths = lengths[:100]
    # hmmlearn expects one (n_samples, 1) array plus per-sequence lengths.
    trainingSequences = np.concatenate(trainingSequences)
    trainingSequences = trainingSequences.reshape(-1, 1)
    print("Finished converting to numpy")
    print(trainingSequences.shape)
    network.fit(trainingSequences, lengths)
    with open("hmm.pkl", "wb") as file:
        pickle.dump(network, file)
    return network
#Builds a language model for the HMM
def buildLanguageModelFromText():
    """Build the vocabulary from textData.txt.

    Returns (uniqueWords, wordDictionary): uniqueWords is the list of
    distinct tokens in first-seen order, wordDictionary maps each token
    to its index in that list. Both are pickled to languageModel.pkl and
    wordDictionary.pkl as a side effect.

    Tokenization mirrors trainModel exactly (the two must stay in sync):
    lowercase the raw bytes line, str() it (which keeps the "b'...'"
    repr artifact — the leading token is dropped by the [1:] slice),
    strip :;" characters, turn parentheses into spaces, space-prefix
    sentence punctuation, then split on single spaces.
    """
    uniqueWords = []
    wordDictionary = {}
    # with-block closes the file handle (the original leaked it).
    with open("textData.txt", "rb") as textData:
        for i in textData.readlines():
            words = str(i.lower())
            # Raw strings avoid invalid-escape warnings; patterns unchanged.
            words = re.sub(r"[:;\"]", "", words)
            words = re.sub(r"[\(\)]", " ", words)
            words = re.sub(r"([\.!?,])", r" \g<0>", words)
            words = words.split(" ")[1:]
            for j in words:
                # Membership test replaces the original bare try/except,
                # which silently swallowed every exception type.
                if j not in wordDictionary:
                    uniqueWords.append(j)
                    wordDictionary[j] = len(uniqueWords) - 1
    print(uniqueWords[-10:])
    print("finished generating language model")
    with open("languageModel.pkl", "wb") as file:
        pickle.dump(uniqueWords, file)
    with open("wordDictionary.pkl", "wb") as file:
        pickle.dump(wordDictionary, file)
    return uniqueWords, wordDictionary