#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File : utils.py
# @Author: LauTrueYes
# @Date : 2022/7/30 20:35
import json
import torch


def load_dataset(file_path):
    """Load a dataset stored as one JSON object per line."""
    dataset = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            dataset.append(json.loads(line))
    return dataset


class Vocab(object):
    """Maps words to integer ids; id 0 is reserved for the PAD token."""

    def __init__(self):
        self.id2word = None
        self.word2id = {'PAD': 0}

    def add(self, dataset):
        # Assign a fresh id to every unseen character/word in the dataset texts.
        idx = len(self.word2id)
        for item in dataset:
            for word in item['text']:
                if word not in self.word2id:
                    self.word2id[word] = idx
                    idx += 1
        self.id2word = {i: w for w, i in self.word2id.items()}

    def __len__(self):
        return len(self.word2id)


class DataLoader(object):
    """Yields the dataset in fixed-size batches; the last batch may be smaller."""

    def __init__(self, dataset, batch_size):
        self.dataset = dataset
        self.batch_size = batch_size

    def __iter__(self):
        batch = []
        for item in self.dataset:
            batch.append(item)
            if len(batch) == self.batch_size:
                yield batch
                batch = []
        if batch:
            yield batch

    def __len__(self):
        # Number of batches, rounding up for the final partial batch.
        return (len(self.dataset) + self.batch_size - 1) // self.batch_size


def batch_variable(batch_data, vocab, config):
    """Convert a batch of raw items into padded word/label id tensors and a mask."""
    sentence_list = []
    labels_list = []
    for item in batch_data:
        sentence = item['text']
        # Build BIO labels from the span annotations in item['label'].
        labels = ['O'] * len(sentence)
        for label_name, tag in item['label'].items():
            for sub_name, sub_index in tag.items():
                for start_index, end_index in sub_index:
                    assert ''.join(sentence[start_index:end_index + 1]) == sub_name
                    labels[start_index] = 'B-' + label_name
                    labels[start_index + 1:end_index + 1] = ['I-' + label_name] * (end_index - start_index)
        sentence_list.append(sentence)
        labels_list.append(labels)

    # Pad every sequence to the longest sentence in the batch; id 0 is PAD.
    batch_size = len(batch_data)
    max_seq_len = max(len(item['text']) for item in batch_data)
    word_ids = torch.zeros((batch_size, max_seq_len), dtype=torch.long)
    label_ids = torch.zeros((batch_size, max_seq_len), dtype=torch.long)
    label_mask = torch.zeros((batch_size, max_seq_len), dtype=torch.bool)
    for index, (sentence, labels) in enumerate(zip(sentence_list, labels_list)):
        word_ids[index, :len(sentence)] = torch.tensor([vocab.word2id[word] for word in sentence])
        label_ids[index, :len(labels)] = torch.tensor([config.label2id[label] for label in labels])
        label_mask[index, :len(labels)] = True
    return word_ids.to(config.device), label_ids.to(config.device), label_mask.to(config.device)
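

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): it shows how the
# helpers above are expected to fit together. The data path 'data/train.json'
# and the minimal Config stand-in (label2id, device) are assumptions made for
# this example only; the real project presumably defines its own config.
if __name__ == '__main__':
    class Config(object):
        device = torch.device('cpu')
        label2id = None  # filled in below from the dataset's annotations

    config = Config()
    dataset = load_dataset('data/train.json')  # hypothetical path
    vocab = Vocab()
    vocab.add(dataset)

    # Derive the BIO label inventory from the span annotations.
    label_set = {'O'}
    for item in dataset:
        for label_name in item['label']:
            label_set.update({'B-' + label_name, 'I-' + label_name})
    config.label2id = {label: i for i, label in enumerate(sorted(label_set))}

    for batch in DataLoader(dataset, batch_size=32):
        word_ids, label_ids, label_mask = batch_variable(batch, vocab, config)
        print(word_ids.shape, label_ids.shape, label_mask.shape)
        break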