-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvam.h
148 lines (141 loc) · 6.22 KB
/
convam.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#ifndef CONVAM_H_
#define CONVAM_H_
#include <unsupported/Eigen/CXX11/Tensor>
#include "approx_mul_lut.h"
//#include <tensorflow/core/framework/op_kernel.h>
//#include <fstream>
//typedef unsigned long long cudaTextureObject_t;
// NOTE(review): the commented-out block below is a legacy copy of
// approx_mul_lut_base, superseded by the version in approx_mul_lut.h
// (included above). Prefer deleting it outright over keeping it commented.
//class approx_mul_lut_base {
//public:
//explicit approx_mul_lut_base(tensorflow::OpKernelConstruction* context) :
//mant_mul_lut_{0}, exp_mul_lut_{0} {
//load_lut_binary();
//}
//virtual ~approx_mul_lut_base() = default;
//// same for both CPU and GPU
//auto load_lut_binary() -> void {
//// open mant mul file
//std::ifstream file("mbmmantmul16.bin", std::ios::in | std::ios::binary);
//if(file.fail()) {
//std::cerr << "file mbmmantmul16.bin failed" << std::endl;
//exit(1);
//}
//if(!file.is_open()) {
//std::cerr << "file mbmmantmul16.bin open failed" << std::endl;
//exit(1);
//}
//mant_mul_lut_.resize(128*128);
//file.read(
//reinterpret_cast<char *>(mant_mul_lut_.data()),
//mant_mul_lut_.size() * sizeof(uint32_t)
//);
//// open exponent file
//std::ifstream exp_file("exp.bin", std::ios::in|std::ios::binary);
//if(exp_file.fail()) {
//std::cerr << "file exp.bin failed" << std::endl;
//exit(1);
//}
//if(!exp_file.is_open()) {
//std::cerr << "file exp.bin open failed" << std::endl;
//exit(1);
//}
//exp_mul_lut_.resize(2*2*256);
//exp_file.read(
//reinterpret_cast<char *>(exp_mul_lut_.data()),
//exp_mul_lut_.size() * sizeof(uint32_t)
//);
//}
//auto get_mant_mul_lut_text_() -> cudaTextureObject_t& {
//return mant_mul_lut_text_;
//}
//auto get_exp_mul_lut_text_() -> cudaTextureObject_t& {
//return exp_mul_lut_text_;
//}
//protected:
//std::vector<uint32_t> mant_mul_lut_;
//std::vector<uint32_t> exp_mul_lut_;
//uint32_t* mant_mul_lut_cuda_;
//uint32_t* exp_mul_lut_cuda_;
//cudaTextureObject_t mant_mul_lut_text_;
//cudaTextureObject_t exp_mul_lut_text_;
//};
//template <typename Device>
//class approx_mul_lut : public approx_mul_lut_base {
//public:
//explicit approx_mul_lut(tensorflow::OpKernelConstruction *context);
//~approx_mul_lut();
//};
// Forward-pass functor for the Convam (approximate-multiplier) 2-D
// convolution. This is the generic declaration; concrete per-device
// implementations are provided elsewhere (GPU specialization below under
// GOOGLE_CUDA).
//
// Parameters:
//   d                  - device on which to run the computation.
//   input_data         - input activation tensor (read-only).
//   output_data        - destination buffer for the convolution result.
//   batch, out_rows, out_cols, out_depth - output tensor geometry.
//   stride_cols, stride_rows             - convolution strides.
//   filter_left_offset, filter_top_offset - padding offsets of the filter
//                        window relative to the input.
//   filter_rows, filter_cols, in_depth   - filter geometry.
//   input_cols, input_rows               - input spatial dimensions.
//   filter             - filter weights (read-only).
//   im2col             - scratch buffer; presumably holds an im2col patch
//                        lowering of the input - TODO confirm against impl.
//   padding            - padding-mode flag (encoding defined by the caller).
//   mul_lut            - lookup tables driving the approximate multiplier.
template <typename Device, typename T>
struct ConvamFunctor {
void operator()(const Device& d, const T* input_data, T* output_data,
const int batch, const int out_rows, const int out_cols,
const int out_depth, const int stride_cols, const int stride_rows,
const int filter_left_offset, const int filter_top_offset,
const int filter_rows, const int filter_cols, const int in_depth,
const int input_cols, const int input_rows, const T* filter,
T* im2col, const int padding,
approx_mul_lut<Device>& mul_lut
);
};
// Backward-data functor: gradient of the Convam convolution with respect to
// its input. Generic declaration; per-device implementations live elsewhere.
//
// Parameters:
//   d                  - device on which to run the computation.
//   grad               - incoming gradient from the downstream op (read-only).
//   im2col             - scratch buffer for patch lowering - TODO confirm
//                        exact layout against the implementation.
//   hole_grad_width, hole_grad_height - dimensions of the dilated ("holed")
//                        gradient, presumably used when stride > 1 - verify.
//   pad_top, pad_left  - padding applied before correlating with the filter.
//   filter             - forward-pass filter weights (read-only).
//   rsfilter           - scratch buffer, presumably for the rotated/reversed
//                        filter - TODO confirm.
//   filter_rows, filter_cols, out_depth, stride_rows, stride_cols, batch,
//   input_rows, input_cols, in_depth, out_rows, out_cols - geometry,
//                        mirroring ConvamFunctor's parameters.
//   output             - buffer receiving the computed input gradient.
//   mul_lut            - lookup tables driving the approximate multiplier.
template <typename Device, typename T>
struct ConvamInputGradFunctor {
void operator()(const Device& d, const T* grad, T* im2col,
const int hole_grad_width, const int hole_grad_height,
const int pad_top, const int pad_left, const T* filter, T* rsfilter,
const int filter_rows, const int filter_cols, const int out_depth,
const int stride_rows, const int stride_cols, const int batch,
const int input_rows, const int input_cols, const int in_depth,
T* output, const int out_rows, const int out_cols,
approx_mul_lut<Device>& mul_lut
);
};
// Backward-filter functor: gradient of the Convam convolution with respect
// to the filter weights. Generic declaration; per-device implementations
// live elsewhere.
//
// Parameters:
//   d                  - device on which to run the computation.
//   input              - forward-pass input activations (read-only).
//   grad               - incoming gradient from the downstream op (read-only).
//   im2col             - scratch buffer for patch lowering - TODO confirm
//                        exact layout against the implementation.
//   input_rows, input_cols, batch, in_depth, out_cols, out_rows, out_depth,
//   filter_left_offset, filter_top_offset, stride_rows, stride_cols,
//   filter_cols, filter_rows - geometry, mirroring ConvamFunctor's
//                        parameters.
//   output             - buffer receiving the computed filter gradient.
//   mul_lut            - lookup tables driving the approximate multiplier.
template <typename Device, typename T>
struct ConvamFilterGradFunctor{
void operator()(const Device& d, const T* input, const T* grad, T* im2col,
const int input_rows, const int input_cols, const int batch,
const int in_depth, const int out_cols, const int out_rows,
const int out_depth, const int filter_left_offset,
const int filter_top_offset, const int stride_rows,
const int stride_cols, const int filter_cols, const int filter_rows,
T* output, approx_mul_lut<Device>& mul_lut
);
};
#ifdef GOOGLE_CUDA
// Partially specialize functor for GpuDevice.
// GPU partial specialization of the forward-pass functor. Parameters mirror
// the generic ConvamFunctor declaration; the implementation (CUDA kernels)
// is provided in the corresponding .cu file - not visible here.
template <typename T>
struct ConvamFunctor<Eigen::GpuDevice, T> {
void operator()(const Eigen::GpuDevice& d, const T* input_data,
T* output_data, const int batch, const int out_rows, const int out_cols,
const int out_depth, const int stride_cols, const int stride_rows,
const int filter_left_offset, const int filter_top_offset,
const int filter_rows, const int filter_cols, const int in_depth,
const int input_cols, const int input_rows, const T* filter,
T* im2col, const int padding,
approx_mul_lut<Eigen::GpuDevice>& mul_lut
);
};
// GPU partial specialization of the backward-data functor. Parameters mirror
// the generic ConvamInputGradFunctor declaration; the CUDA implementation is
// provided elsewhere - not visible here.
template <typename T>
struct ConvamInputGradFunctor<Eigen::GpuDevice, T> {
void operator()(const Eigen::GpuDevice& d, const T* grad, T* im2col,
const int hole_grad_width, const int hole_grad_height,
const int pad_top, const int pad_left, const T* filter, T* rsfilter,
const int filter_rows, const int filter_cols, const int out_depth,
const int stride_rows, const int stride_cols, const int batch,
const int input_rows, const int input_cols, const int in_depth,
T* output, const int out_rows, const int out_cols,
approx_mul_lut<Eigen::GpuDevice>& mul_lut
);
};
// GPU partial specialization of the backward-filter functor. Parameters
// mirror the generic ConvamFilterGradFunctor declaration; the CUDA
// implementation is provided elsewhere - not visible here.
template <typename T>
struct ConvamFilterGradFunctor<Eigen::GpuDevice, T>{
void operator()(const Eigen::GpuDevice& d, const T* input, const T* grad,
T* im2col, const int input_rows, const int input_cols,
const int batch, const int in_depth, const int out_cols,
const int out_rows,const int out_depth, const int filter_left_offset,
const int filter_top_offset, const int stride_rows,
const int stride_cols, const int filter_cols, const int filter_rows,
T* output, approx_mul_lut<Eigen::GpuDevice>& mul_lut
);
};
#endif
#endif