
Commit

add NCRFpp code
cjer committed May 5, 2021
1 parent f691ed2 commit e8bd1b2
Showing 16 changed files with 2,981 additions and 0 deletions.
1 change: 1 addition & 0 deletions model/__init__.py
@@ -0,0 +1 @@
__author__ = 'max'
77 changes: 77 additions & 0 deletions model/charbigru.py
@@ -0,0 +1,77 @@
# -*- coding: utf-8 -*-
# @Author: Jie Yang
# @Date: 2017-10-17 16:47:32
# @Last Modified by: Jie Yang, Contact: [email protected]
# @Last Modified time: 2018-10-18 11:12:13
from __future__ import print_function
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import numpy as np

class CharBiGRU(nn.Module):
def __init__(self, alphabet_size, pretrain_char_embedding, embedding_dim, hidden_dim, dropout, gpu, bidirect_flag = True):
super(CharBiGRU, self).__init__()
print("build char sequence feature extractor: GRU ...")
self.gpu = gpu
self.hidden_dim = hidden_dim
if bidirect_flag:
self.hidden_dim = hidden_dim // 2
self.char_drop = nn.Dropout(dropout)
self.char_embeddings = nn.Embedding(alphabet_size, embedding_dim)
if pretrain_char_embedding is not None:
self.char_embeddings.weight.data.copy_(torch.from_numpy(pretrain_char_embedding))
else:
self.char_embeddings.weight.data.copy_(torch.from_numpy(self.random_embedding(alphabet_size, embedding_dim)))
self.char_lstm = nn.GRU(embedding_dim, self.hidden_dim, num_layers=1, batch_first=True, bidirectional=bidirect_flag)
if self.gpu:
self.char_drop = self.char_drop.cuda()
self.char_embeddings = self.char_embeddings.cuda()
self.char_lstm = self.char_lstm.cuda()


def random_embedding(self, vocab_size, embedding_dim):
pretrain_emb = np.empty([vocab_size, embedding_dim])
scale = np.sqrt(3.0 / embedding_dim)
for index in range(vocab_size):
pretrain_emb[index,:] = np.random.uniform(-scale, scale, [1, embedding_dim])
return pretrain_emb


def get_last_hiddens(self, input, seq_lengths):
"""
input:
input: Variable(batch_size, word_length)
seq_lengths: numpy array (batch_size, 1)
output:
Variable(batch_size, char_hidden_dim)
            Note: the batch must be sorted by sequence length in descending order; the lengths are recorded in seq_lengths
"""
batch_size = input.size(0)
char_embeds = self.char_drop(self.char_embeddings(input))
char_hidden = None
pack_input = pack_padded_sequence(char_embeds, seq_lengths, True)
char_rnn_out, char_hidden = self.char_lstm(pack_input, char_hidden)
        ## char_hidden = h_n = (num_directions, batch_size, hidden_dim)
        # char_rnn_out, _ = pad_packed_sequence(char_rnn_out)
        return char_hidden.transpose(1,0).contiguous().view(batch_size,-1)

def get_all_hiddens(self, input, seq_lengths):
"""
input:
input: Variable(batch_size, word_length)
seq_lengths: numpy array (batch_size, 1)
output:
Variable(batch_size, word_length, char_hidden_dim)
            Note: the batch must be sorted by sequence length in descending order; the lengths are recorded in seq_lengths
"""
batch_size = input.size(0)
char_embeds = self.char_drop(self.char_embeddings(input))
char_hidden = None
pack_input = pack_padded_sequence(char_embeds, seq_lengths, True)
char_rnn_out, char_hidden = self.char_lstm(pack_input, char_hidden)
char_rnn_out, _ = pad_packed_sequence(char_rnn_out)
return char_rnn_out.transpose(1,0)


def forward(self, input, seq_lengths):
return self.get_all_hiddens(input, seq_lengths)
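A minimal usage sketch for CharBiGRU (not part of the commit, with assumed hyperparameters: a 30-character alphabet, 25-dimensional character embeddings, a 50-dimensional char hidden size, and no pretrained embeddings). The batch must be zero-padded and sorted by length in descending order, as pack_padded_sequence requires; forward() returns one hidden vector per character position, while get_last_hiddens returns one vector per word.

import numpy as np
import torch

from model.charbigru import CharBiGRU   # assumes the sketch is run from the repository root

char_gru = CharBiGRU(alphabet_size=30, pretrain_char_embedding=None,
                     embedding_dim=25, hidden_dim=50, dropout=0.5, gpu=False)
char_gru.eval()                                      # disable dropout for a deterministic check

# Three words, zero-padded to length 5 and sorted by length (descending).
char_ids = torch.LongTensor([[3, 7, 2, 9, 4],
                             [5, 1, 8, 0, 0],
                             [6, 2, 0, 0, 0]])
char_lengths = np.array([5, 3, 2])

all_hiddens = char_gru(char_ids, char_lengths)       # forward() == get_all_hiddens
print(all_hiddens.shape)                             # torch.Size([3, 5, 50])

word_vectors = char_gru.get_last_hiddens(char_ids, char_lengths)
print(word_vectors.shape)                            # torch.Size([3, 50])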
79 changes: 79 additions & 0 deletions model/charbilstm.py
@@ -0,0 +1,79 @@
# -*- coding: utf-8 -*-
# @Author: Jie Yang
# @Date: 2017-10-17 16:47:32
# @Last Modified by: Jie Yang, Contact: [email protected]
# @Last Modified time: 2018-10-18 11:19:37
from __future__ import print_function
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import numpy as np

class CharBiLSTM(nn.Module):
def __init__(self, alphabet_size, pretrain_char_embedding, embedding_dim, hidden_dim, dropout, gpu, bidirect_flag = True):
super(CharBiLSTM, self).__init__()
print("build char sequence feature extractor: LSTM ...")
self.gpu = gpu
self.hidden_dim = hidden_dim
if bidirect_flag:
self.hidden_dim = hidden_dim // 2
self.char_drop = nn.Dropout(dropout)
self.char_embeddings = nn.Embedding(alphabet_size, embedding_dim)
if pretrain_char_embedding is not None:
self.char_embeddings.weight.data.copy_(torch.from_numpy(pretrain_char_embedding))
else:
self.char_embeddings.weight.data.copy_(torch.from_numpy(self.random_embedding(alphabet_size, embedding_dim)))
self.char_lstm = nn.LSTM(embedding_dim, self.hidden_dim, num_layers=1, batch_first=True, bidirectional=bidirect_flag)
if self.gpu:
self.char_drop = self.char_drop.cuda()
self.char_embeddings = self.char_embeddings.cuda()
self.char_lstm = self.char_lstm.cuda()


def random_embedding(self, vocab_size, embedding_dim):
pretrain_emb = np.empty([vocab_size, embedding_dim])
scale = np.sqrt(3.0 / embedding_dim)
for index in range(vocab_size):
pretrain_emb[index,:] = np.random.uniform(-scale, scale, [1, embedding_dim])
return pretrain_emb


def get_last_hiddens(self, input, seq_lengths):
"""
input:
input: Variable(batch_size, word_length)
seq_lengths: numpy array (batch_size, 1)
output:
Variable(batch_size, char_hidden_dim)
            Note: the batch must be sorted by sequence length in descending order; the lengths are recorded in seq_lengths
"""
batch_size = input.size(0)
char_embeds = self.char_drop(self.char_embeddings(input))
char_hidden = None
pack_input = pack_padded_sequence(char_embeds, seq_lengths, True)
char_rnn_out, char_hidden = self.char_lstm(pack_input, char_hidden)
## char_hidden = (h_t, c_t)
# char_hidden[0] = h_t = (2, batch_size, lstm_dimension)
# char_rnn_out, _ = pad_packed_sequence(char_rnn_out)
return char_hidden[0].transpose(1,0).contiguous().view(batch_size,-1)

def get_all_hiddens(self, input, seq_lengths):
"""
input:
input: Variable(batch_size, word_length)
seq_lengths: numpy array (batch_size, 1)
output:
Variable(batch_size, word_length, char_hidden_dim)
            Note: the batch must be sorted by sequence length in descending order; the lengths are recorded in seq_lengths
"""
batch_size = input.size(0)
char_embeds = self.char_drop(self.char_embeddings(input))
char_hidden = None
pack_input = pack_padded_sequence(char_embeds, seq_lengths, True)
char_rnn_out, char_hidden = self.char_lstm(pack_input, char_hidden)
char_rnn_out, _ = pad_packed_sequence(char_rnn_out)
return char_rnn_out.transpose(1,0)


def forward(self, input, seq_lengths):
return self.get_all_hiddens(input, seq_lengths)
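The calling convention is identical to CharBiGRU above; a brief sketch (again with assumed sizes, not part of the commit) showing that get_last_hiddens concatenates the final forward and backward LSTM states into one fixed-size vector per word:

import numpy as np
import torch

from model.charbilstm import CharBiLSTM

char_lstm = CharBiLSTM(alphabet_size=30, pretrain_char_embedding=None,
                       embedding_dim=25, hidden_dim=50, dropout=0.5, gpu=False)
char_lstm.eval()

char_ids = torch.LongTensor([[4, 2, 9, 1],
                             [7, 3, 0, 0]])          # zero-padded, sorted by length
last_hiddens = char_lstm.get_last_hiddens(char_ids, np.array([4, 2]))
print(last_hiddens.shape)                            # torch.Size([2, 50]): 25 forward + 25 backward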
74 changes: 74 additions & 0 deletions model/charcnn.py
@@ -0,0 +1,74 @@
# -*- coding: utf-8 -*-
# @Author: Jie Yang
# @Date: 2017-10-17 16:47:32
# @Last Modified by: Jie Yang, Contact: [email protected]
# @Last Modified time: 2019-01-18 21:06:06
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

class CharCNN(nn.Module):
def __init__(self, alphabet_size, pretrain_char_embedding, embedding_dim, hidden_dim, dropout, gpu, kernel_size):
super(CharCNN, self).__init__()
print("build char sequence feature extractor: CNN ...")
self.gpu = gpu
self.hidden_dim = hidden_dim
self.char_drop = nn.Dropout(dropout)
self.char_embeddings = nn.Embedding(alphabet_size, embedding_dim)
self.kernel_size = kernel_size
if pretrain_char_embedding is not None:
self.char_embeddings.weight.data.copy_(torch.from_numpy(pretrain_char_embedding))
else:
self.char_embeddings.weight.data.copy_(torch.from_numpy(self.random_embedding(alphabet_size, embedding_dim)))
self.char_cnn = nn.Conv1d(embedding_dim, self.hidden_dim, kernel_size=self.kernel_size, padding=int(self.kernel_size/2))
if self.gpu:
self.char_drop = self.char_drop.cuda()
self.char_embeddings = self.char_embeddings.cuda()
self.char_cnn = self.char_cnn.cuda()


def random_embedding(self, vocab_size, embedding_dim):
pretrain_emb = np.empty([vocab_size, embedding_dim])
scale = np.sqrt(3.0 / embedding_dim)
for index in range(vocab_size):
pretrain_emb[index,:] = np.random.uniform(-scale, scale, [1, embedding_dim])
return pretrain_emb


def get_last_hiddens(self, input, seq_lengths):
"""
input:
input: Variable(batch_size, word_length)
seq_lengths: numpy array (batch_size, 1)
output:
Variable(batch_size, char_hidden_dim)
            Note: the batch must be sorted by sequence length in descending order; the lengths are recorded in seq_lengths
"""
batch_size = input.size(0)
char_embeds = self.char_drop(self.char_embeddings(input))
char_embeds = char_embeds.transpose(2,1).contiguous()
char_cnn_out = self.char_cnn(char_embeds)
char_cnn_out = F.max_pool1d(char_cnn_out, char_cnn_out.size(2)).view(batch_size, -1)
return char_cnn_out

def get_all_hiddens(self, input, seq_lengths):
"""
input:
input: Variable(batch_size, word_length)
seq_lengths: numpy array (batch_size, 1)
output:
Variable(batch_size, word_length, char_hidden_dim)
            Note: the batch must be sorted by sequence length in descending order; the lengths are recorded in seq_lengths
"""
batch_size = input.size(0)
char_embeds = self.char_drop(self.char_embeddings(input))
char_embeds = char_embeds.transpose(2,1).contiguous()
char_cnn_out = self.char_cnn(char_embeds).transpose(2,1).contiguous()
return char_cnn_out



def forward(self, input, seq_lengths):
return self.get_all_hiddens(input, seq_lengths)
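A short sketch with assumed hyperparameters (not part of the commit): the CNN extractor convolves over the character embeddings and, in get_last_hiddens, max-pools over character positions, so every word is mapped to a fixed hidden_dim-sized vector regardless of its length. Note that seq_lengths is accepted for interface compatibility with the recurrent extractors but is not used here.

import numpy as np
import torch

from model.charcnn import CharCNN

char_cnn = CharCNN(alphabet_size=30, pretrain_char_embedding=None,
                   embedding_dim=25, hidden_dim=50, dropout=0.5, gpu=False,
                   kernel_size=3)
char_cnn.eval()

char_ids = torch.LongTensor([[4, 2, 9, 1, 6],
                             [7, 3, 5, 0, 0]])
pooled = char_cnn.get_last_hiddens(char_ids, np.array([5, 3]))
print(pooled.shape)                                  # torch.Size([2, 50])

per_position = char_cnn.get_all_hiddens(char_ids, np.array([5, 3]))
print(per_position.shape)                            # torch.Size([2, 5, 50])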
