-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodel.py
More file actions
105 lines (80 loc) · 2.92 KB
/
model.py
File metadata and controls
105 lines (80 loc) · 2.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import numpy as np
from hmmlearn import hmm
import pickle
import re
#Returns saved model from file
def loadModel():
    """Load and return the pickled HMM network from hmm.pkl."""
    with open("hmm.pkl", "rb") as saved:
        return pickle.load(saved)
#Return language model
def loadLanguage():
    """Load the pickled language model.

    Returns a tuple (uniqueWords, wordDictionary) read from
    languageModel.pkl and wordDictionary.pkl respectively.
    """
    with open("languageModel.pkl", "rb") as fh:
        vocab = pickle.load(fh)
    with open("wordDictionary.pkl", "rb") as fh:
        lookup = pickle.load(fh)
    return vocab, lookup
#Defines network architecture and returns untrained model
def createModel():
    """Build and return an untrained 50-state Gaussian HMM.

    Uses a full covariance matrix per state and caps EM training at
    5 iterations (presumably to keep training time down — confirm
    convergence is acceptable at n_iter=5).
    """
    network = hmm.GaussianHMM(n_components=50, covariance_type="full", n_iter=5)
    # NOTE: the original body contained a bare `network.monitor_` attribute
    # access — a no-op statement with no effect — which has been removed.
    return network
#Trains model based on data in text file, may be database later
def trainModel(network, languageModel, wordDictionary):
    """Fit `network` on word-index sequences built from textData.txt.

    Each line of the file is tokenized exactly like in
    buildLanguageModelFromText (the two must stay in sync so every token
    has an entry in `wordDictionary`), mapped to integer indices, and the
    concatenated sequences are passed to hmmlearn's fit(). The trained
    network is pickled to hmm.pkl and returned.

    Raises KeyError if a token is missing from `wordDictionary`.
    """
    trainingSequences = []
    lengths = []
    # with-block closes the file handle (the original leaked it).
    with open("textData.txt", "rb") as textData:
        for i in textData.readlines():
            # str() of a bytes line yields "b'...'" — the leading "b'..."
            # token is discarded by the [1:] slice below. Kept as-is so the
            # tokens match the vocabulary built by buildLanguageModelFromText.
            words = str(i.lower())
            # Raw strings avoid invalid-escape warnings; patterns unchanged.
            words = re.sub(r"[:;\"]", "", words)
            words = re.sub(r"[\(\)]", " ", words)
            words = re.sub(r"([\.!?,])", r" \g<0>", words)
            words = words.split(" ")[1:]
            sequence = [wordDictionary[j] for j in words]
            lengths.append(len(words))
            trainingSequences.append(np.array(sequence))
    print("Finished creating training sequence data")
    #TEMPORARY FOR RESTRICTING AMOUNT OF DATA TO TRAIN ON
    trainingSequences = trainingSequences[:100]
    lengths = lengths[:100]
    # hmmlearn expects one (n_samples, 1) array plus per-sequence lengths.
    trainingSequences = np.concatenate(trainingSequences)
    trainingSequences = trainingSequences.reshape(-1, 1)
    print("Finished converting to numpy")
    print(trainingSequences.shape)
    network.fit(trainingSequences, lengths)
    with open("hmm.pkl", "wb") as file:
        pickle.dump(network, file)
    return network
#Builds a language model for the HMM
def buildLanguageModelFromText():
    """Build the vocabulary from textData.txt.

    Returns (uniqueWords, wordDictionary): uniqueWords is the list of
    distinct tokens in first-seen order, wordDictionary maps each token
    to its index in that list. Both are pickled to languageModel.pkl and
    wordDictionary.pkl as a side effect.

    Tokenization mirrors trainModel exactly (the two must stay in sync):
    lowercase the raw bytes line, str() it (which keeps the "b'...'"
    repr artifact — the leading token is dropped by the [1:] slice),
    strip :;" characters, turn parentheses into spaces, space-prefix
    sentence punctuation, then split on single spaces.
    """
    uniqueWords = []
    wordDictionary = {}
    # with-block closes the file handle (the original leaked it).
    with open("textData.txt", "rb") as textData:
        for i in textData.readlines():
            words = str(i.lower())
            # Raw strings avoid invalid-escape warnings; patterns unchanged.
            words = re.sub(r"[:;\"]", "", words)
            words = re.sub(r"[\(\)]", " ", words)
            words = re.sub(r"([\.!?,])", r" \g<0>", words)
            words = words.split(" ")[1:]
            for j in words:
                # Membership test replaces the original bare try/except,
                # which silently swallowed every exception type.
                if j not in wordDictionary:
                    uniqueWords.append(j)
                    wordDictionary[j] = len(uniqueWords) - 1
    print(uniqueWords[-10:])
    print("finished generating language model")
    with open("languageModel.pkl", "wb") as file:
        pickle.dump(uniqueWords, file)
    with open("wordDictionary.pkl", "wb") as file:
        pickle.dump(wordDictionary, file)
    return uniqueWords, wordDictionary