GNU Radio Manual and C++ API Reference  3.7.7
The Free & Open Software Radio Ecosystem
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
volk_32f_tan_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 /*!
24  * \page volk_32f_tan_32f
25  *
26  * \b Overview
27  *
28  * Computes the tangent of each element of the aVector.
29  *
30  * b[i] = tan(a[i])
31  *
32  * <b>Dispatcher Prototype</b>
33  * \code
34  * void volk_32f_tan_32f(float* bVector, const float* aVector, unsigned int num_points)
35  * \endcode
36  *
37  * \b Inputs
38  * \li aVector: The buffer of points.
39  * \li num_points: The number of values in the input buffer.
40  *
41  * \b Outputs
42  * \li bVector: The output buffer.
43  *
44  * \b Example
45  * Calculate tan(theta) for common angles.
46  * \code
47  * int N = 10;
48  * unsigned int alignment = volk_get_alignment();
49  * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
50  * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
51  *
52  * in[0] = 0.000;
53  * in[1] = 0.524;
54  * in[2] = 0.785;
55  * in[3] = 1.047;
56  * in[4] = 1.571 ;
57  * in[5] = -1.571;
58  * in[6] = -1.047;
59  * in[7] = -0.785;
60  * in[8] = -0.524;
61  * in[9] = -0.000;
62  *
63  * volk_32f_tan_32f(out, in, N);
64  *
65  * for(unsigned int ii = 0; ii < N; ++ii){
66  * printf("tan(%1.3f) = %1.3f\n", in[ii], out[ii]);
67  * }
68  *
69  * volk_free(in);
70  * volk_free(out);
71  * \endcode
72  */
73 
74 #include <stdio.h>
75 #include <math.h>
76 #include <inttypes.h>
77 
78 #ifndef INCLUDED_volk_32f_tan_32f_a_H
79 #define INCLUDED_volk_32f_tan_32f_a_H
80 
81 #ifdef LV_HAVE_SSE4_1
82 #include <smmintrin.h>
83 
84 static inline void
85 volk_32f_tan_32f_a_sse4_1(float* bVector, const float* aVector,
86  unsigned int num_points)
87 {
88  float* bPtr = bVector;
89  const float* aPtr = aVector;
90 
91  unsigned int number = 0;
92  unsigned int quarterPoints = num_points / 4;
93  unsigned int i = 0;
94 
95  __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
96  __m128 sine, cosine, tangent, condition1, condition2, condition3;
97  __m128i q, r, ones, twos, fours;
98 
99  m4pi = _mm_set1_ps(1.273239545);
100  pio4A = _mm_set1_ps(0.78515625);
101  pio4B = _mm_set1_ps(0.241876e-3);
102  ffours = _mm_set1_ps(4.0);
103  ftwos = _mm_set1_ps(2.0);
104  fones = _mm_set1_ps(1.0);
105  fzeroes = _mm_setzero_ps();
106  ones = _mm_set1_epi32(1);
107  twos = _mm_set1_epi32(2);
108  fours = _mm_set1_epi32(4);
109 
110  cp1 = _mm_set1_ps(1.0);
111  cp2 = _mm_set1_ps(0.83333333e-1);
112  cp3 = _mm_set1_ps(0.2777778e-2);
113  cp4 = _mm_set1_ps(0.49603e-4);
114  cp5 = _mm_set1_ps(0.551e-6);
115 
116  for(;number < quarterPoints; number++){
117  aVal = _mm_load_ps(aPtr);
118  s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
119  q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
120  r = _mm_add_epi32(q, _mm_and_si128(q, ones));
121 
122  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
123  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
124 
125  s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
126  s = _mm_mul_ps(s, s);
127  // Evaluate Taylor series
128  s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
129 
130  for(i = 0; i < 3; i++){
131  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
132  }
133  s = _mm_div_ps(s, ftwos);
134 
135  sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
136  cosine = _mm_sub_ps(fones, s);
137 
138  condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
139  condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
140  condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
141 
142  __m128 temp = cosine;
143  cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
144  sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
145  sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
146  cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
147  tangent = _mm_div_ps(sine, cosine);
148  _mm_store_ps(bPtr, tangent);
149  aPtr += 4;
150  bPtr += 4;
151  }
152 
153  number = quarterPoints * 4;
154  for(;number < num_points; number++){
155  *bPtr++ = tan(*aPtr++);
156  }
157 }
158 
159 #endif /* LV_HAVE_SSE4_1 for aligned */
160 
161 
162 #endif /* INCLUDED_volk_32f_tan_32f_a_H */
163 
164 #ifndef INCLUDED_volk_32f_tan_32f_u_H
165 #define INCLUDED_volk_32f_tan_32f_u_H
166 
167 #ifdef LV_HAVE_SSE4_1
168 #include <smmintrin.h>
169 
170 static inline void
171 volk_32f_tan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
172 {
173  float* bPtr = bVector;
174  const float* aPtr = aVector;
175 
176  unsigned int number = 0;
177  unsigned int quarterPoints = num_points / 4;
178  unsigned int i = 0;
179 
180  __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
181  __m128 sine, cosine, tangent, condition1, condition2, condition3;
182  __m128i q, r, ones, twos, fours;
183 
184  m4pi = _mm_set1_ps(1.273239545);
185  pio4A = _mm_set1_ps(0.78515625);
186  pio4B = _mm_set1_ps(0.241876e-3);
187  ffours = _mm_set1_ps(4.0);
188  ftwos = _mm_set1_ps(2.0);
189  fones = _mm_set1_ps(1.0);
190  fzeroes = _mm_setzero_ps();
191  ones = _mm_set1_epi32(1);
192  twos = _mm_set1_epi32(2);
193  fours = _mm_set1_epi32(4);
194 
195  cp1 = _mm_set1_ps(1.0);
196  cp2 = _mm_set1_ps(0.83333333e-1);
197  cp3 = _mm_set1_ps(0.2777778e-2);
198  cp4 = _mm_set1_ps(0.49603e-4);
199  cp5 = _mm_set1_ps(0.551e-6);
200 
201  for(;number < quarterPoints; number++){
202  aVal = _mm_loadu_ps(aPtr);
203  s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
204  q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
205  r = _mm_add_epi32(q, _mm_and_si128(q, ones));
206 
207  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
208  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
209 
210  s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
211  s = _mm_mul_ps(s, s);
212  // Evaluate Taylor series
213  s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
214 
215  for(i = 0; i < 3; i++){
216  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
217  }
218  s = _mm_div_ps(s, ftwos);
219 
220  sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
221  cosine = _mm_sub_ps(fones, s);
222 
223  condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
224  condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
225  condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
226 
227  __m128 temp = cosine;
228  cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
229  sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
230  sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
231  cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
232  tangent = _mm_div_ps(sine, cosine);
233  _mm_storeu_ps(bPtr, tangent);
234  aPtr += 4;
235  bPtr += 4;
236  }
237 
238  number = quarterPoints * 4;
239  for(;number < num_points; number++){
240  *bPtr++ = tan(*aPtr++);
241  }
242 }
243 
244 #endif /* LV_HAVE_SSE4_1 for unaligned */
245 
246 
247 #ifdef LV_HAVE_GENERIC
248 
249 static inline void
250 volk_32f_tan_32f_generic(float* bVector, const float* aVector,
251  unsigned int num_points)
252 {
253  float* bPtr = bVector;
254  const float* aPtr = aVector;
255  unsigned int number = 0;
256 
257  for(; number < num_points; number++){
258  *bPtr++ = tan(*aPtr++);
259  }
260 }
261 #endif /* LV_HAVE_GENERIC */
262 
263 
264 #endif /* INCLUDED_volk_32f_tan_32f_u_H */