diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..e501407 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,34 @@ +cmake_minimum_required(VERSION 3.0) + +project(cis565_parallel_fft) + +set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH}) + +# Enable C++11 for host code +set(CMAKE_CXX_STANDARD 11) + +list(APPEND CUDA_NVCC_FLAGS_DEBUG -G -g) +list(APPEND CUDA_NVCC_FLAGS_RELWITHDEBUGINFO -lineinfo) + +# Crucial magic for CUDA linking +find_package(Threads REQUIRED) +find_package(CUDA REQUIRED) + +set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE ON) +set(CUDA_SEPARABLE_COMPILATION ON) + +if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") + set(CUDA_PROPAGATE_HOST_FLAGS OFF) +endif() + +include_directories(.) +add_subdirectory(parallel_fft) + +cuda_add_executable(${CMAKE_PROJECT_NAME} + "src/main.cpp" + ) + +target_link_libraries(${CMAKE_PROJECT_NAME} + parallel_fft + ${CORELIBS} + ) diff --git a/INSTRUCTION.md b/INSTRUCTION.md deleted file mode 100644 index bf01a90..0000000 --- a/INSTRUCTION.md +++ /dev/null @@ -1,431 +0,0 @@ -Proj 5 WebGL Deferred Shader - Instructions -======================== - -This is due at midnight on the midnight of **Tue Nov 8 2016**. - -**Summary:** In this project, you'll be introduced to the basics of deferred -shading and WebGL. You'll use GLSL and WebGL to implement a deferred shading -pipeline and various lighting and visual effects. - -**Recommendations:** -Take screenshots as you go. Use them to document your progress in your README! - -Read (or at least skim) the full README before you begin, so that you know what -to expect and what to prepare for. - -### Running the code - -If you have Python, you should be able to run `server.py` to start a server. -Then, open [`http://localhost:10565/`](http://localhost:10565/) in your browser. - -This project requires a WebGL-capable web browser with support for -`WEBGL_draw_buffers`. You can check for support on -[WebGL Report](http://webglreport.com/). - -Google Chrome seems to work best on all platforms. If you have problems running -the starter code, use Chrome or Chromium, and make sure you have updated your -browser and video drivers. Firefox's shader editor may require that you disable -WebGL debugging in `framework.js` (see below). - -Use the screenshot button to save a screenshot. - -## Requirements - -**Ask on the mailing list for any clarifications.** - -In this project, you are given code for: - -* Loading glTF models -* Camera control -* Partial implementation of deferred shading including many helper functions - -### Required Tasks - -**Before doing performance analysis,** you must disable debug mode by changing -`debugMode` to `false` in `framework.js`. Keep it enabled when developing - it -helps find WebGL errors *much* more easily. - -You will need to perform the following tasks: - -* Complete the deferred shading pipeline so that the Blinn-Phong and Post1 - shaders recieve the correct input. Go through the Starter Code Tour **before - continuing!** - -**Effects:** - -* Implement deferred Blinn-Phong shading (diffuse + specular) for point lights - * With normal mapping (code provided) - * For deferred shading, you want to use a lighting model for the point lights - which has a limited radius - so that adding a scissor or proxy geometry - will not cause parts of the lighting to disappear. It should look very - similar both with and without scissor/proxy optimization. 
Here is a - convenient lighting model, but you can also use others: - * `float attenuation = max(0.0, u_lightRad - dist_from_surface_to_light);` - -* Implement one of the following effects: - * Bloom using post-process blur (box or Gaussian) [1] - * Toon shading (with ramp shading + simple depth-edge detection for outlines) - -**Optimizations:** - -* Scissor test optimization: when accumulating shading from each point - light source, only render in a rectangle around the light. - * Show a debug view for this (showing scissor masks clearly), e.g. by - modifying and using `red.frag.glsl` with additive blending and alpha = 0.1. - * Code is provided to compute this rectangle for you, and there are - comments at the relevant place in `deferredRender.js` with more guidance. - * **NOTE:** The provided scissor function is not very accurate - it is a - quick hack which results in some errors (as can be seen in the live - demo). - -* Optimized g-buffer format - reduce the number and size of g-buffers: - * Ideas: - * Pack values together into vec4s - * Use 2-component normals - * Quantize values by using smaller texture types instead of gl.FLOAT - * Reduce number of properties passed via g-buffer, e.g. by: - * Applying the normal map in the `copy` shader pass instead of - copying both geometry normals and normal maps - * Reconstructing world space position using camera matrices and X/Y/depth - * For credit, you must show a good optimization effort and record the - performance of each version you test, in a simple table. - * It is expected that you won't need all 4 provided g-buffers for a basic - pipeline - make sure you disable the unused ones. - * See mainly: `copy.frag.glsl`, `deferred/*.glsl`, `deferredSetup.js` - -### Extra Tasks - -You must do at least **4 points** worth of extra features (effects or -optimizations/analysis). - -**Effects:** - -* (3pts) The effect you didn't choose above (bloom or toon shading) - -* (3pts) Screen-space motion blur (blur along velocity direction) [3] - -* (2pts) Allow variability in additional material properties - * Include other properties (e.g. specular coeff/exponent) in g-buffers - * Use this to render objects with different material properties - * These may be uniform across one model draw call, but you'll have to show - multiple models - -**Optimizations/Analysis:** - -* (2pts) Improved screen-space AABB for scissor test - (smaller/more accurate than provided - but beware of CPU/GPU tradeoffs) - -* (3pts) Two-pass **Gaussian** blur using separable convolution (using a second - postprocess render pass) to improve bloom or other 2D blur performance - -* (4pts) Light proxies - * (4pts) Instead of rendering a scissored full-screen quad for every light, - render some proxy geometry which covers the part of the screen affected by - the light (e.g. a sphere, for an attenuated point light). - * A model called `sphereModel` is provided which can be drawn in the same - way as the code in `drawScene`. (Must be drawn with a vertex shader which - scales it to the light radius and translates it to the light position.) - * (+ extra 2pts) To avoid lighting geometry far behind the light, render the proxy - geometry (e.g. sphere) using an inverted depth test - (`gl.depthFunc(gl.GREATER)`) with depth writing disabled (`gl.depthMask`). - This test will pass only for parts of the screen for which the backside of - the sphere appears behind parts of the scene. - * Note that the copy pass's depth buffer must be bound to the FBO during - this operation! 
- * Show a debug view for this (showing light proxies) - * Compare performance of this, naive, and scissoring. - -* (6pts) Tile-based deferred shading with detailed performance comparison - * On the CPU, check which lights overlap which tiles. Then, render each tile - just once for all lights (instead of once for each light), applying only - the overlapping lights. - * The method is described very well in - [Yuqin & Sijie's README](https://github.com/YuqinShao/Tile_Based_WebGL_DeferredShader/blob/master/README.md#algorithm-details). - * This feature requires allocating the global light list and tile light - index lists as shown at this link. These can be implemented as textures. - * Show a debug view for this (number of lights per tile) - * (+ extra 4pts) Do the light tile overlapping test on GPU (using shader). - This might be a little hacky, the idea is similar to - the light culling stage that you need to implemnt in [Project5A-Forward-Plus-Renderer](https://github.com/CIS565-Fall-2016/Project5A-WebGL-Forward-Plus-Shading-with-glTF). - Show performance comparison to deferred and tile-based deferred with light tile test on CPU. - -* (3pts) Compare performance to equivalently-lit forward-rendering and forward-plus-rendering: - * You can pair with a classmate choosing to do forward-plus-renderer. - -This extra feature list is not comprehensive. If you have a particular idea -that you would like to implement, please **contact us first** (preferably on -the mailing list). - -**Where possible, all features should be switchable using the GUI panel in -`ui.js`.** - -### Performance & Analysis - -**Before doing performance analysis,** you must disable debug mode by changing -`debugMode` to `false` in `framework.js`. Keep it enabled when developing - it -helps find WebGL errors *much* more easily. - -Optimize your JavaScript and/or GLSL code. Chrome/Firefox's profiling tools -(see Resources section) will be useful for this. For each change -that improves performance, show the before and after render times. - -For each new *effect* feature (required or extra), please -provide the following analysis: - -* Concise overview write-up of the feature. -* Performance change due to adding the feature. - * If applicable, how do parameters (such as number of lights, etc.) - affect performance? Show data with simple graphs. - * Show timing in milliseconds, not FPS. -* If you did something to accelerate the feature, what did you do and why? -* How might this feature be optimized beyond your current implementation? - -For each *performance* feature (required or extra), please provide: - -* Concise overview write-up of the feature. -* Detailed performance improvement analysis of adding the feature - * What is the best case scenario for your performance improvement? What is - the worst? Explain briefly. - * Are there tradeoffs to this performance feature? Explain briefly. - * How do parameters (such as number of lights, tile size, etc.) affect - performance? Show data with graphs. - * Show timing in milliseconds, not FPS. - * Show debug views when possible. - * If the debug view correlates with performance, explain how. - -### Starter Code Tour - -You'll be working mainly in `deferredRender.js` and shaders in `glsl/` folder using raw WebGL. -Three.js is included in the project for various reasons. You won't use it for much, but its -matrix/vector types may come in handy. You don't need to learn Three.js for this project. 
- -For editing JavaScript, you can use a simple editor with syntax highlighting -such as Atom, VS-Code, Sublime, Vim, Emacs, etc., or the editor built into Chrome. - -It's highly recommended that you use the browser debugger (F12 on Windows) to inspect variables -to get familiar with the code. At any point, you can also -`console.log(some_var);` to show it in the console and inspect it. - -The setup in `deferredSetup` is already done for you, for many of the features. -If you want to add uniforms (textures or values), you'll change them here. -Therefore, it is recommended that you review the comments to understand the -process, BEFORE starting work in `deferredRender`. - -Test incrementally (after implementing each part, instead of testing -all at once). - -Here's a guide to get you started: - -* Your first goal is to render a red fullscreen quad. - - `js/deferredRender.js`: walk through the code and implment by uncommenting those labeled with `TODO: uncomment` - - implement function `renderFullScreenQuad` - -* Your _next_ first goal should be to get the debug views working. -Add code in `debug.frag.glsl` to examine your g-buffers before trying to -render them. (Set the debugView in the UI to show them.) - - `js/deferredRender.js` - - implement `R.pass_copy.render` - - implement `R.pass_debug.render` - - `glsl/copy.frag.glsl` - - `glsl/deferred/debug.frag.glsl` - -* At this point you should have some understanding of how WebGL works. -Those bind buffer, bind texture, vertex pointer is copying data from cpu to gpu and tell -the gpu how to access them. It is very similar to what you've done in your Proj4 rasterizer. -glsl Shaders are code running on gpu. You can treat them as cuda kernel functions in your Proj4. - -* Now go ahead and implement the deferred shading - - `js/deferredRender.js`: finish implementing this file. This time you need to write some code. - - `glsl/deferred/ambient.frag.glsl` - - `glsl/deferred/blinnphong-pointlight.frag.glsl` - - everything... you are on your own now. - - -Full files guidance: - -* `js/`: JavaScript files for this project. - * `main.js`: Handles initialization of other parts of the program. - * `framework.js`: Loads the scene, camera, etc., and calls your setup/render - functions. Hopefully, you won't need to change anything here. - * `deferredSetup.js`: Deferred shading pipeline setup code. - * `createAndBind(Depth/Color)TargetTexture`: Creates empty textures for - binding to frame buffer objects as render targets. - * `deferredRender.js`: Your deferred shading pipeline execution code. - * `renderFullScreenQuad`: Renders a full-screen quad with the given shader - program. - * `ui.js`: Defines the UI using - [dat.GUI](https://workshop.chromeexperiments.com/examples/gui/). - * The global variable `cfg` can be accessed anywhere in the code to read - configuration values. - * `utils.js`: Utilities for JavaScript and WebGL. - * `abort`: Aborts the program and shows an error. - * `loadTexture`: Loads a texture from a URL into WebGL. - * `loadShaderProgram`: Loads shaders from URLs into a WebGL shader program. - * `loadModel`: Loads a model into WebGL buffers. - * `readyModelForDraw`: Configures the WebGL state to draw a model. - * `drawReadyModel`: Draws a model which has been readied. - * `getScissorForLight`: Computes an approximate scissor rectangle for a - light in world space. -* `glsl/`: GLSL code for each part of the pipeline: - * `clear.*.glsl`: Clears each of the `NUM_GBUFFERS` g-buffers. 
- * `copy.*.glsl`: Performs standard rendering without any fragment shading, - storing all of the resulting values into the `NUM_GBUFFERS` g-buffers. - * `quad.vert.glsl`: Minimal vertex shader for rendering a single quad. - * `deferred.frag.glsl`: Deferred shading pass (for lighting calculations). - Reads from each of the `NUM_GBUFFERS` g-buffers. - * `post1.frag.glsl`: First post-processing pass. -* `lib/`: JavaScript libraries. -* `models/`: glTF models for testing. Sponza is the default (And the only tested supported one for now. Due to time limitation T_T. ) -* `index.html`: Main HTML page. -* `server.bat` (Windows) or `server.py` (OS X/Linux): - Runs a web server at `localhost:10565`. - -### The Deferred Shading Pipeline - -See the comments in `deferredSetup.js`/`deferredRender.js` for low-level guidance. - -In order to enable and disable effects using the GUI, upload a vec4 uniform -where each component is an enable/disable flag. In JavaScript, the state of the -UI is accessible anywhere as `cfg.enableEffect0`, etc. - -**Pass 1:** Renders the scene geometry and its properties to the g-buffers. -* `copy.vert.glsl`, `copy.frag.glsl` -* The framebuffer object `pass_copy.fbo` must be bound during this pass. -* Renders into `pass_copy.depthTex` and `pass_copy.gbufs[i]`, which need to be - attached to the framebuffer. - -**Pass 2:** Performs lighting and shading into the color buffer. -* `quad.vert.glsl`, `deferred/blinnphong-pointlight.frag.glsl` -* Takes the g-buffers `pass_copy.gbufs`/`depthTex` as texture inputs to the - fragment shader, on uniforms `u_gbufs` and `u_depth`. -* `pass_deferred.fbo` must be bound. -* Renders into `pass_deferred.colorTex`. - -**Pass 3:** Performs post-processing. -* `quad.vert.glsl`, `post/one.frag.glsl` -* Takes `pass_BlinnPhong_PointLight.colorTex` as a texture input `u_color`. -* Renders directly to the screen if there are no additional passes. - -More passes may be added for additional effects (e.g. combining bloom with -motion blur) or optimizations (e.g. two-pass Gaussian blur for bloom) - -#### Debugging - -If there is a WebGL error, it will be displayed on the developer console and -the renderer will be aborted. To find out where the error came from, look at -the backtrace of the error (you may need to click the triangle to expand the -message). The line right below `wrapper @ webgl-debug.js` will point to the -WebGL call that failed. - -When working in the early pipeline (before you have a lit render), it can be -useful to render WITHOUT post-processing. To do this, you have to make sure -that there is NO framebuffer bound while rendering to the screen (that is, bind -null) so that the output will display to the screen instead of saving into a -texture. Writing to gl_FragData[0] is the same as writing to gl_FragColor, so -you'll see whatever you were storing into the first g-buffer. - -#### Changing the number of g-buffers - -Note that the g-buffers are just `vec4`s - you can put any values you want into -them. However, if you want to change the total number of g-buffers (add more -for additional effects or remove some for performance), you will need to make -changes in a number of places: - -* `deferredSetup.js`/`deferredRender.js`: search for `NUM_GBUFFERS` -* `copy.frag.glsl` -* `deferred.frag.glsl` -* `clear.frag.glsl` - - -## Resources - -* [1] Bloom: - [GPU Gems, Ch. 
21](http://http.developer.nvidia.com/GPUGems/gpugems_ch21.html) -* [2] Screen-Space Ambient Occlusion: - [Floored Article](http://www.floored.com/blog/2015ssao-screen-space-ambient-occlusion/) -* [3] Post-Process Motion Blur: - [GPU Gems 3, Ch. 27](http://http.developer.nvidia.com/GPUGems3/gpugems3_ch27.html) - -**Also see:** The articles linked in the course schedule. - -### Profiling and debugging tools - -Built into Firefox: -* Canvas inspector -* Shader Editor -* JavaScript debugger and profiler - -Built into Chrome: -* JavaScript debugger and profiler - -Plug-ins: -* Web Tracing Framework - **Does not currently work with multiple render targets**, - which are used in the starter code. -* (Chrome) [Shader Editor](https://chrome.google.com/webstore/detail/shader-editor/ggeaidddejpbakgafapihjbgdlbbbpob) - -Libraries: -* Stats.js (already included) - -Firefox can also be useful - it has a canvas inspector, WebGL profiling and a -shader editor built in. - -## README - -Replace the contents of the README.md in a clear manner with the following: - -* A brief description of the project and the specific features you implemented. -* At least one screenshot of your project running. -* A 30+ second video/gif of your project running showing all features. - (Even though your demo can be seen online, using multiple render targets - means it won't run on many computers. A video will work everywhere.) -* A performance analysis (described below). - -### Performance Analysis - -See above. - -### GitHub Pages - -Since this assignment is in WebGL, you can make your project easily viewable by -taking advantage of GitHub's project pages feature. - -Once you are done with the assignment, create a new branch: - -`git branch gh-pages` - -Push the branch to GitHub: - -`git push origin gh-pages` - -Now, you can go to `.github.io/` to see your -renderer online from anywhere. Add this link to your README. - -## Submit - -Beware of any build issues discussed on the Google Group. - -Open a GitHub pull request so that we can see that you have finished. -The title should be "Project 5B: YOUR NAME". -The template of the comment section of your pull request is attached below, you can do some copy and paste: - -* [Repo Link](https://link-to-your-repo) -* `Your PENNKEY` -* (Briefly) Mentions features that you've completed. Especially those bells and whistles you want to highlight - * Feature 0 - * Feature 1 - * ... -* Feedback on the project itself, if any. - -### Third-Party Code Policy - -* Use of any third-party code must be approved by asking on our mailing list. -* If it is approved, all students are welcome to use it. Generally, we approve - use of third-party code that is not a core part of the project. For example, - for the path tracer, we would approve using a third-party library for loading - models, but would not approve copying and pasting a CUDA function for doing - refraction. -* Third-party code **MUST** be credited in README.md. -* Using third-party code without its approval, including using another - student's code, is an academic integrity violation, and will, at minimum, - result in you receiving an F for the semester. 
diff --git a/README.md b/README.md index 25002db..a8065fe 100644 --- a/README.md +++ b/README.md @@ -1,33 +1,117 @@ -WebGL Deferred Shading +Parallel Fast Fourier Transform ====================== **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 5** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) **Google Chrome 222.2** on - Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Gabriel Naghi +* Tested on: + - CPU implementation: Linux OpenSUSE, Intel Xeon E5-2470 @ 2.4 GHz, 32 GB RAM (Eniac) + - GPU implementation: Windows 10, Intel Core i7-2600 @ 3.4 GHz, 8 GB RAM, GeForce GT 730 1024 MB (DSL) -### Live Online +## Fourier Transforms +Fourier Transforms define a process by which to transform a signal from the time domain to the frequency domain (the "forward transform") and vice versa (the "inverse transform"). Fourier Transforms rely on the principle that any signal can be represented as magnitude and phase as a function of frequency. -[![](img/thumb.png)](http://TODO.github.io/Project5B-WebGL-Deferred-Shading) +![](img/Fourier_unit_pulse.png) +Source: Wikipedia -### Demo Video/GIF +Representing a signal in the frequency domain is useful for a few reasons. First and foremost, it tells you which frequencies are present in the signal, and in what proportions. Another important use is that a multiplication in the frequency domain is equivalent to a convolution in the time domain; it is generally easier to transform and multiply than to compute a convolution. A similar optimization exists for cross-correlations. It is also much easier to compute the n-th derivative of a function in the frequency domain than in the time domain. There are many other uses of Fourier Transforms (see the discussion [here](http://dsp.stackexchange.com/questions/69/why-is-the-fourier-transform-so-important)). -[![](img/video.png)](TODO) +## Discrete Fourier Transforms -### (TODO: Your README) +In practice, Discrete Fourier Transforms (DFTs) are used. This means that the samples are finite in number and equally spaced in time. The transform works by correlating each sample with analyzing functions in the form of sinusoids, which produces large coefficients where the signal resembles the analyzing sinusoid and small ones where it does not. -*DO NOT* leave the README to the last minute! It is a crucial part of the -project, and we will not be able to grade you without a good README. +In general, the formula for computing a given frequency bucket in a DFT is as follows: -This assignment has a considerable amount of performance analysis compared -to implementation work. Complete the implementation early to leave time! +![](img/dft.png) +Source: http://www.cmlab.csie.ntu.edu.tw/cml/dsp/training/coding/transform/fft.html +This results in an O(N^2) algorithm. We can do much better. -### Credits -* [Three.js](https://github.com/mrdoob/three.js) by [@mrdoob](https://github.com/mrdoob) and contributors -* [stats.js](https://github.com/mrdoob/stats.js) by [@mrdoob](https://github.com/mrdoob) and contributors -* [webgl-debug](https://github.com/KhronosGroup/WebGLDeveloperTools) by Khronos Group Inc. -* [glMatrix](https://github.com/toji/gl-matrix) by [@toji](https://github.com/toji) and contributors -* [minimal-gltf-loader](https://github.com/shrekshao/minimal-gltf-loader) by [@shrekshao](https://github.com/shrekshao)
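+A direct translation of the formula above is just a double loop over output buckets and input samples. The host-side sketch below is purely illustrative (the function name is hypothetical and this is not the code in `src/`), but it shows where the O(N^2) cost comes from:
+
+~~~
+#include <cmath>
+#include <complex>
+#include <vector>
+
+// Naive O(N^2) DFT: every output bucket k is correlated against all N samples.
+// Illustrative only -- an FFT computes the same result in O(N lg N).
+std::vector<std::complex<float>> naiveDft(const std::vector<std::complex<float>>& x) {
+    const std::size_t N = x.size();
+    const float PI = 3.14159265358979f;
+    std::vector<std::complex<float>> X(N);
+    for (std::size_t k = 0; k < N; ++k) {       // one pass per frequency bucket
+        std::complex<float> sum(0.0f, 0.0f);
+        for (std::size_t n = 0; n < N; ++n) {   // correlate with exp(-2*pi*i*k*n/N)
+            const float angle = -2.0f * PI * static_cast<float>(k * n) / static_cast<float>(N);
+            sum += x[n] * std::complex<float>(std::cos(angle), std::sin(angle));
+        }
+        X[k] = sum;
+    }
+    return X;
+}
+~~~
+
+Each of the N output buckets touches all N input samples, which is exactly the O(N^2) work the FFT avoids.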
+## Fast Fourier Transforms +Originally discovered by Carl Friedrich Gauss and re-popularized by J.W. Cooley and John Tukey in the 20th century, the Fast Fourier Transform exploits two properties to reduce the number of elementary operations: + +![](img/properties.png) +Source: http://www.cmlab.csie.ntu.edu.tw/cml/dsp/training/coding/transform/fft.html + +In short, this allows us to recursively break an N-point DFT into N/2-point sub-problems in a divide-and-conquer strategy and then recombine the sub-pieces. + +From Wikipedia, the Cooley-Tukey FFT pseudocode is as follows: + +~~~ +X0,...,N−1 ← ditfft2(x, N, s): DFT of (x0, xs, x2s, ..., x(N-1)s): + if N = 1 then + X0 ← x0 trivial size-1 DFT base case + else + X0,...,N/2−1 ← ditfft2(x, N/2, 2s) DFT of (x0, x2s, x4s, ...) + XN/2,...,N−1 ← ditfft2(x+s, N/2, 2s) DFT of (xs, xs+2s, xs+4s, ...) + for k = 0 to N/2−1 combine DFTs of two halves into full DFT: + t ← Xk + Xk ← t + exp(−2πi k/N) Xk+N/2 + Xk+N/2 ← t − exp(−2πi k/N) Xk+N/2 + endfor + endif +~~~ + +### Parallel Fast Fourier Transform Algorithm + +My parallel implementation of the Fast Fourier Transform is divided into three primary stages: + +1. Input reorganization +2. Twiddle factor multiplication +3. Butterfly + +The first stage simply reorders the inputs so that the outputs end up in their correct buckets. As it happens, the required order is the bit-reversed order of the input indices. Bit reversal is a harder problem than one might initially realize, but fortunately Stanford hosts a page about just this (see [here](http://graphics.stanford.edu/~seander/bithacks.html)). I used the "Reverse an N-bit quantity in parallel in 5 * lg(N) operations" method. + +After the inputs have been reorganized, the sub-DFTs must be computed. Starting at the base case of N=2, the upper-half indices are multiplied by their proper twiddle factor, and then each value is added to or subtracted from the element N/2 away. This forms the famous "butterfly" pattern, depicted below in an image from Wikipedia. + +![](img/butterfly.png) + +These operations are conducted lg N times, each pass involving O(N) complex additions and subtractions, as depicted in the flow diagram below. + +![](img/correctbutterfly.png) +Source: Scientific Research Publishing + +The time complexity of this algorithm is thus O(N lg N). Moreover, there is plenty of embarrassing parallelism within each pass, so a parallel implementation should speed things up considerably over a serial one.
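+To make the three stages concrete, here is a minimal CUDA sketch of the reorder kernel, a combined twiddle/butterfly kernel, and the host loop that drives them. The names and launch configuration are illustrative rather than the actual code in `parallel_fft/`, and it assumes N is a power of two (at least 2), device-resident buffers, and `float2` standing in for a complex number:
+
+~~~
+#include <cuda_runtime.h>
+#include <math_constants.h>   // CUDART_PI_F
+
+// Stage 1: scatter each input sample to the bit-reversed position of its index.
+__global__ void kernBitReverse(int N, int logN, const float2* in, float2* out) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= N) return;
+    unsigned int r = __brev((unsigned int)i) >> (32 - logN);  // reverse the low logN bits
+    out[r] = in[i];
+}
+
+// Stages 2 and 3: one butterfly pass. Each thread owns one pair of elements,
+// multiplies the upper element by its twiddle factor, and writes the sum and
+// difference of the pair.
+__global__ void kernButterfly(int N, int half, float2* data) {
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= N / 2) return;
+
+    int group = tid / half;             // which butterfly group of size 2*half
+    int k     = tid % half;             // position within that group
+    int lo    = group * 2 * half + k;   // lower element of the pair
+    int hi    = lo + half;              // upper element of the pair
+
+    // Twiddle factor w = exp(-2*pi*i * k / (2*half))
+    float angle = -2.0f * CUDART_PI_F * (float)k / (float)(2 * half);
+    float2 w = make_float2(cosf(angle), sinf(angle));
+
+    float2 a = data[lo];
+    float2 b = data[hi];
+    float2 wb = make_float2(w.x * b.x - w.y * b.y,   // complex multiply w * b
+                            w.x * b.y + w.y * b.x);
+
+    data[lo] = make_float2(a.x + wb.x, a.y + wb.y);
+    data[hi] = make_float2(a.x - wb.x, a.y - wb.y);
+}
+
+// Host driver: one reorder pass, then lg N butterfly passes of growing span.
+// dev_in and dev_out are device pointers allocated and filled by the caller.
+void parallelFft(int N, const float2* dev_in, float2* dev_out) {
+    const int blockSize = 64;           // the block size chosen in the analysis below
+    int logN = 0;
+    while ((1 << logN) < N) ++logN;
+
+    dim3 fullGrid((N + blockSize - 1) / blockSize);
+    kernBitReverse<<<fullGrid, blockSize>>>(N, logN, dev_in, dev_out);
+
+    dim3 halfGrid((N / 2 + blockSize - 1) / blockSize);
+    for (int half = 1; half < N; half *= 2) {
+        kernButterfly<<<halfGrid, blockSize>>>(N, half, dev_out);
+    }
+    cudaDeviceSynchronize();
+}
+~~~
+
+One reorder pass plus lg N butterfly passes of N/2 threads each matches the O(N lg N) operation count above.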
+## Performance Analysis + +My first optimization, as always, was to find a generally optimal block size for the working implementation. No particular intermediate value performed especially well, so I chose a block size of 64 as my "optimal" block size, against which I compared the CPU implementation. + +![](img/blocksizes.png) + +Unfortunately for me (and everyone else hoping for an easy exploit in the embarrassingly parallel department), FFTW, an acronym for Fastest Fourier Transform in the West, really lives up to its name. It completely blew away my parallel GPU implementation, even on very large inputs. + +![](img/implementations.png) + +To be fair to my unhappy little implementation, FFTW is a 100k+ line monstrosity of finely tuned computation and is generally considered the gold standard when it comes to Fourier transforms. Some of the optimizations built into FFTW are: + +* Routines coded in assembly +* SIMD instructions +* Dynamic programming techniques to select from multiple strategies for a given input and machine (including memory and cache) +* Hard-coded, unrolled FFTs for small sizes + + +## Future Work +There is a lot of room for improvement in the FFT implementation I've done, including: + +* Vectorization +* Shared memory usage +* Generalization to non-radix-2 sizes + + +## Bloopers + +I spent a gratuitous amount of time trying to decode GPU Gems 2's description of the algorithm, especially with regard to the twiddle factor. + +![](img/gpugems.png) +Source: NVIDIA GPU Gems 2 + +I could not figure out for the life of me what the relationship was between the stage/index and the exponent of the presumably global Nth root of unity. Fortunately, I eventually stumbled upon the diagram below, which depicts the proper procedure and generally makes sense vis-à-vis the actual Cooley-Tukey algorithm. + +![](img/correctbutterfly.png) +Source: Scientific Research Publishing + +### Sources +* GPU Gems 2 +* [YouTube](https://www.youtube.com/watch?v=EsJGuI7e_ZQ) diff --git a/cmake/CMakeParseArguments.cmake b/cmake/CMakeParseArguments.cmake new file mode 100644 index 0000000..8553f38 --- /dev/null +++ b/cmake/CMakeParseArguments.cmake @@ -0,0 +1,161 @@ +#.rst: +# CMakeParseArguments +# ------------------- +# +# +# +# CMAKE_PARSE_ARGUMENTS( args...) +# +# CMAKE_PARSE_ARGUMENTS() is intended to be used in macros or functions +# for parsing the arguments given to that macro or function. It +# processes the arguments and defines a set of variables which hold the +# values of the respective options. +# +# The argument contains all options for the respective macro, +# i.e. keywords which can be used when calling the macro without any +# value following, like e.g. the OPTIONAL keyword of the install() +# command. +# +# The argument contains all keywords for this macro +# which are followed by one value, like e.g. DESTINATION keyword of the +# install() command. +# +# The argument contains all keywords for this +# macro which can be followed by more than one value, like e.g. the +# TARGETS or FILES keywords of the install() command. +# +# When done, CMAKE_PARSE_ARGUMENTS() will have defined for each of the +# keywords listed in , and +# a variable composed of the given +# followed by "_" and the name of the respective keyword. These +# variables will then hold the respective value from the argument list. +# For the keywords this will be TRUE or FALSE. +# +# All remaining arguments are collected in a variable +# _UNPARSED_ARGUMENTS, this can be checked afterwards to see +# whether your macro was called with unrecognized parameters. +# +# As an example here a my_install() macro, which takes similar arguments +# as the real install() command: +# +# :: +# +# function(MY_INSTALL) +# set(options OPTIONAL FAST) +# set(oneValueArgs DESTINATION RENAME) +# set(multiValueArgs TARGETS CONFIGURATIONS) +# cmake_parse_arguments(MY_INSTALL "${options}" "${oneValueArgs}" +# "${multiValueArgs}" ${ARGN} ) +# ...
+# +# +# +# Assume my_install() has been called like this: +# +# :: +# +# my_install(TARGETS foo bar DESTINATION bin OPTIONAL blub) +# +# +# +# After the cmake_parse_arguments() call the macro will have set the +# following variables: +# +# :: +# +# MY_INSTALL_OPTIONAL = TRUE +# MY_INSTALL_FAST = FALSE (this option was not used when calling my_install() +# MY_INSTALL_DESTINATION = "bin" +# MY_INSTALL_RENAME = "" (was not used) +# MY_INSTALL_TARGETS = "foo;bar" +# MY_INSTALL_CONFIGURATIONS = "" (was not used) +# MY_INSTALL_UNPARSED_ARGUMENTS = "blub" (no value expected after "OPTIONAL" +# +# +# +# You can then continue and process these variables. +# +# Keywords terminate lists of values, e.g. if directly after a +# one_value_keyword another recognized keyword follows, this is +# interpreted as the beginning of the new option. E.g. +# my_install(TARGETS foo DESTINATION OPTIONAL) would result in +# MY_INSTALL_DESTINATION set to "OPTIONAL", but MY_INSTALL_DESTINATION +# would be empty and MY_INSTALL_OPTIONAL would be set to TRUE therefor. + +#============================================================================= +# Copyright 2010 Alexander Neundorf +# +# Distributed under the OSI-approved BSD License (the "License"); +# see accompanying file Copyright.txt for details. +# +# This software is distributed WITHOUT ANY WARRANTY; without even the +# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the License for more information. +#============================================================================= +# (To distribute this file outside of CMake, substitute the full +# License text for the above reference.) + + +if(__CMAKE_PARSE_ARGUMENTS_INCLUDED) + return() +endif() +set(__CMAKE_PARSE_ARGUMENTS_INCLUDED TRUE) + + +function(CMAKE_PARSE_ARGUMENTS prefix _optionNames _singleArgNames _multiArgNames) + # first set all result variables to empty/FALSE + foreach(arg_name ${_singleArgNames} ${_multiArgNames}) + set(${prefix}_${arg_name}) + endforeach() + + foreach(option ${_optionNames}) + set(${prefix}_${option} FALSE) + endforeach() + + set(${prefix}_UNPARSED_ARGUMENTS) + + set(insideValues FALSE) + set(currentArgName) + + # now iterate over all arguments and fill the result variables + foreach(currentArg ${ARGN}) + list(FIND _optionNames "${currentArg}" optionIndex) # ... then this marks the end of the arguments belonging to this keyword + list(FIND _singleArgNames "${currentArg}" singleArgIndex) # ... then this marks the end of the arguments belonging to this keyword + list(FIND _multiArgNames "${currentArg}" multiArgIndex) # ... 
then this marks the end of the arguments belonging to this keyword + + if(${optionIndex} EQUAL -1 AND ${singleArgIndex} EQUAL -1 AND ${multiArgIndex} EQUAL -1) + if(insideValues) + if("${insideValues}" STREQUAL "SINGLE") + set(${prefix}_${currentArgName} ${currentArg}) + set(insideValues FALSE) + elseif("${insideValues}" STREQUAL "MULTI") + list(APPEND ${prefix}_${currentArgName} ${currentArg}) + endif() + else() + list(APPEND ${prefix}_UNPARSED_ARGUMENTS ${currentArg}) + endif() + else() + if(NOT ${optionIndex} EQUAL -1) + set(${prefix}_${currentArg} TRUE) + set(insideValues FALSE) + elseif(NOT ${singleArgIndex} EQUAL -1) + set(currentArgName ${currentArg}) + set(${prefix}_${currentArgName}) + set(insideValues "SINGLE") + elseif(NOT ${multiArgIndex} EQUAL -1) + set(currentArgName ${currentArg}) + set(${prefix}_${currentArgName}) + set(insideValues "MULTI") + endif() + endif() + + endforeach() + + # propagate the result variables to the caller: + foreach(arg_name ${_singleArgNames} ${_multiArgNames} ${_optionNames}) + set(${prefix}_${arg_name} ${${prefix}_${arg_name}} PARENT_SCOPE) + endforeach() + set(${prefix}_UNPARSED_ARGUMENTS ${${prefix}_UNPARSED_ARGUMENTS} PARENT_SCOPE) + +endfunction() diff --git a/cmake/FindCUDA.cmake b/cmake/FindCUDA.cmake new file mode 100644 index 0000000..f4b0783 --- /dev/null +++ b/cmake/FindCUDA.cmake @@ -0,0 +1,1806 @@ +#.rst: +# FindCUDA +# -------- +# +# Tools for building CUDA C files: libraries and build dependencies. +# +# This script locates the NVIDIA CUDA C tools. It should work on linux, +# windows, and mac and should be reasonably up to date with CUDA C +# releases. +# +# This script makes use of the standard find_package arguments of +# , REQUIRED and QUIET. CUDA_FOUND will report if an +# acceptable version of CUDA was found. +# +# The script will prompt the user to specify CUDA_TOOLKIT_ROOT_DIR if +# the prefix cannot be determined by the location of nvcc in the system +# path and REQUIRED is specified to find_package(). To use a different +# installed version of the toolkit set the environment variable +# CUDA_BIN_PATH before running cmake (e.g. +# CUDA_BIN_PATH=/usr/local/cuda1.0 instead of the default +# /usr/local/cuda) or set CUDA_TOOLKIT_ROOT_DIR after configuring. If +# you change the value of CUDA_TOOLKIT_ROOT_DIR, various components that +# depend on the path will be relocated. +# +# It might be necessary to set CUDA_TOOLKIT_ROOT_DIR manually on certain +# platforms, or to use a cuda runtime not installed in the default +# location. In newer versions of the toolkit the cuda library is +# included with the graphics driver- be sure that the driver version +# matches what is needed by the cuda runtime version. +# +# The following variables affect the behavior of the macros in the +# script (in alphebetical order). Note that any of these flags can be +# changed multiple times in the same directory before calling +# CUDA_ADD_EXECUTABLE, CUDA_ADD_LIBRARY, CUDA_COMPILE, CUDA_COMPILE_PTX, +# CUDA_COMPILE_FATBIN, CUDA_COMPILE_CUBIN or CUDA_WRAP_SRCS:: +# +# CUDA_64_BIT_DEVICE_CODE (Default matches host bit size) +# -- Set to ON to compile for 64 bit device code, OFF for 32 bit device code. +# Note that making this different from the host code when generating object +# or C files from CUDA code just won't work, because size_t gets defined by +# nvcc in the generated source. If you compile to PTX and then load the +# file yourself, you can mix bit sizes between device and host. 
+# +# CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE (Default ON) +# -- Set to ON if you want the custom build rule to be attached to the source +# file in Visual Studio. Turn OFF if you add the same cuda file to multiple +# targets. +# +# This allows the user to build the target from the CUDA file; however, bad +# things can happen if the CUDA source file is added to multiple targets. +# When performing parallel builds it is possible for the custom build +# command to be run more than once and in parallel causing cryptic build +# errors. VS runs the rules for every source file in the target, and a +# source can have only one rule no matter how many projects it is added to. +# When the rule is run from multiple targets race conditions can occur on +# the generated file. Eventually everything will get built, but if the user +# is unaware of this behavior, there may be confusion. It would be nice if +# this script could detect the reuse of source files across multiple targets +# and turn the option off for the user, but no good solution could be found. +# +# CUDA_BUILD_CUBIN (Default OFF) +# -- Set to ON to enable and extra compilation pass with the -cubin option in +# Device mode. The output is parsed and register, shared memory usage is +# printed during build. +# +# CUDA_BUILD_EMULATION (Default OFF for device mode) +# -- Set to ON for Emulation mode. -D_DEVICEEMU is defined for CUDA C files +# when CUDA_BUILD_EMULATION is TRUE. +# +# CUDA_GENERATED_OUTPUT_DIR (Default CMAKE_CURRENT_BINARY_DIR) +# -- Set to the path you wish to have the generated files placed. If it is +# blank output files will be placed in CMAKE_CURRENT_BINARY_DIR. +# Intermediate files will always be placed in +# CMAKE_CURRENT_BINARY_DIR/CMakeFiles. +# +# CUDA_HOST_COMPILATION_CPP (Default ON) +# -- Set to OFF for C compilation of host code. +# +# CUDA_HOST_COMPILER (Default CMAKE_C_COMPILER, $(VCInstallDir)/bin for VS) +# -- Set the host compiler to be used by nvcc. Ignored if -ccbin or +# --compiler-bindir is already present in the CUDA_NVCC_FLAGS or +# CUDA_NVCC_FLAGS_ variables. For Visual Studio targets +# $(VCInstallDir)/bin is a special value that expands out to the path when +# the command is run from withing VS. +# +# CUDA_NVCC_FLAGS +# CUDA_NVCC_FLAGS_ +# -- Additional NVCC command line arguments. NOTE: multiple arguments must be +# semi-colon delimited (e.g. --compiler-options;-Wall) +# +# CUDA_PROPAGATE_HOST_FLAGS (Default ON) +# -- Set to ON to propagate CMAKE_{C,CXX}_FLAGS and their configuration +# dependent counterparts (e.g. CMAKE_C_FLAGS_DEBUG) automatically to the +# host compiler through nvcc's -Xcompiler flag. This helps make the +# generated host code match the rest of the system better. Sometimes +# certain flags give nvcc problems, and this will help you turn the flag +# propagation off. This does not affect the flags supplied directly to nvcc +# via CUDA_NVCC_FLAGS or through the OPTION flags specified through +# CUDA_ADD_LIBRARY, CUDA_ADD_EXECUTABLE, or CUDA_WRAP_SRCS. Flags used for +# shared library compilation are not affected by this flag. +# +# CUDA_SEPARABLE_COMPILATION (Default OFF) +# -- If set this will enable separable compilation for all CUDA runtime object +# files. If used outside of CUDA_ADD_EXECUTABLE and CUDA_ADD_LIBRARY +# (e.g. calling CUDA_WRAP_SRCS directly), +# CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME and +# CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS should be called. 
+# +# CUDA_SOURCE_PROPERTY_FORMAT +# -- If this source file property is set, it can override the format specified +# to CUDA_WRAP_SRCS (OBJ, PTX, CUBIN, or FATBIN). If an input source file +# is not a .cu file, setting this file will cause it to be treated as a .cu +# file. See documentation for set_source_files_properties on how to set +# this property. +# +# CUDA_USE_STATIC_CUDA_RUNTIME (Default ON) +# -- When enabled the static version of the CUDA runtime library will be used +# in CUDA_LIBRARIES. If the version of CUDA configured doesn't support +# this option, then it will be silently disabled. +# +# CUDA_VERBOSE_BUILD (Default OFF) +# -- Set to ON to see all the commands used when building the CUDA file. When +# using a Makefile generator the value defaults to VERBOSE (run make +# VERBOSE=1 to see output), although setting CUDA_VERBOSE_BUILD to ON will +# always print the output. +# +# The script creates the following macros (in alphebetical order):: +# +# CUDA_ADD_CUFFT_TO_TARGET( cuda_target ) +# -- Adds the cufft library to the target (can be any target). Handles whether +# you are in emulation mode or not. +# +# CUDA_ADD_CUBLAS_TO_TARGET( cuda_target ) +# -- Adds the cublas library to the target (can be any target). Handles +# whether you are in emulation mode or not. +# +# CUDA_ADD_EXECUTABLE( cuda_target file0 file1 ... +# [WIN32] [MACOSX_BUNDLE] [EXCLUDE_FROM_ALL] [OPTIONS ...] ) +# -- Creates an executable "cuda_target" which is made up of the files +# specified. All of the non CUDA C files are compiled using the standard +# build rules specified by CMAKE and the cuda files are compiled to object +# files using nvcc and the host compiler. In addition CUDA_INCLUDE_DIRS is +# added automatically to include_directories(). Some standard CMake target +# calls can be used on the target after calling this macro +# (e.g. set_target_properties and target_link_libraries), but setting +# properties that adjust compilation flags will not affect code compiled by +# nvcc. Such flags should be modified before calling CUDA_ADD_EXECUTABLE, +# CUDA_ADD_LIBRARY or CUDA_WRAP_SRCS. +# +# CUDA_ADD_LIBRARY( cuda_target file0 file1 ... +# [STATIC | SHARED | MODULE] [EXCLUDE_FROM_ALL] [OPTIONS ...] ) +# -- Same as CUDA_ADD_EXECUTABLE except that a library is created. +# +# CUDA_BUILD_CLEAN_TARGET() +# -- Creates a convience target that deletes all the dependency files +# generated. You should make clean after running this target to ensure the +# dependency files get regenerated. +# +# CUDA_COMPILE( generated_files file0 file1 ... [STATIC | SHARED | MODULE] +# [OPTIONS ...] ) +# -- Returns a list of generated files from the input source files to be used +# with ADD_LIBRARY or ADD_EXECUTABLE. +# +# CUDA_COMPILE_PTX( generated_files file0 file1 ... [OPTIONS ...] ) +# -- Returns a list of PTX files generated from the input source files. +# +# CUDA_COMPILE_FATBIN( generated_files file0 file1 ... [OPTIONS ...] ) +# -- Returns a list of FATBIN files generated from the input source files. +# +# CUDA_COMPILE_CUBIN( generated_files file0 file1 ... [OPTIONS ...] ) +# -- Returns a list of CUBIN files generated from the input source files. +# +# CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME( output_file_var +# cuda_target +# object_files ) +# -- Compute the name of the intermediate link file used for separable +# compilation. This file name is typically passed into +# CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS. 
output_file_var is produced +# based on cuda_target the list of objects files that need separable +# compilation as specified by object_files. If the object_files list is +# empty, then output_file_var will be empty. This function is called +# automatically for CUDA_ADD_LIBRARY and CUDA_ADD_EXECUTABLE. Note that +# this is a function and not a macro. +# +# CUDA_INCLUDE_DIRECTORIES( path0 path1 ... ) +# -- Sets the directories that should be passed to nvcc +# (e.g. nvcc -Ipath0 -Ipath1 ... ). These paths usually contain other .cu +# files. +# +# +# +# CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS( output_file_var cuda_target +# nvcc_flags object_files) +# +# -- Generates the link object required by separable compilation from the given +# object files. This is called automatically for CUDA_ADD_EXECUTABLE and +# CUDA_ADD_LIBRARY, but can be called manually when using CUDA_WRAP_SRCS +# directly. When called from CUDA_ADD_LIBRARY or CUDA_ADD_EXECUTABLE the +# nvcc_flags passed in are the same as the flags passed in via the OPTIONS +# argument. The only nvcc flag added automatically is the bitness flag as +# specified by CUDA_64_BIT_DEVICE_CODE. Note that this is a function +# instead of a macro. +# +# CUDA_WRAP_SRCS ( cuda_target format generated_files file0 file1 ... +# [STATIC | SHARED | MODULE] [OPTIONS ...] ) +# -- This is where all the magic happens. CUDA_ADD_EXECUTABLE, +# CUDA_ADD_LIBRARY, CUDA_COMPILE, and CUDA_COMPILE_PTX all call this +# function under the hood. +# +# Given the list of files (file0 file1 ... fileN) this macro generates +# custom commands that generate either PTX or linkable objects (use "PTX" or +# "OBJ" for the format argument to switch). Files that don't end with .cu +# or have the HEADER_FILE_ONLY property are ignored. +# +# The arguments passed in after OPTIONS are extra command line options to +# give to nvcc. You can also specify per configuration options by +# specifying the name of the configuration followed by the options. General +# options must preceed configuration specific options. Not all +# configurations need to be specified, only the ones provided will be used. +# +# OPTIONS -DFLAG=2 "-DFLAG_OTHER=space in flag" +# DEBUG -g +# RELEASE --use_fast_math +# RELWITHDEBINFO --use_fast_math;-g +# MINSIZEREL --use_fast_math +# +# For certain configurations (namely VS generating object files with +# CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE set to ON), no generated file will +# be produced for the given cuda file. This is because when you add the +# cuda file to Visual Studio it knows that this file produces an object file +# and will link in the resulting object file automatically. +# +# This script will also generate a separate cmake script that is used at +# build time to invoke nvcc. This is for several reasons. +# +# 1. nvcc can return negative numbers as return values which confuses +# Visual Studio into thinking that the command succeeded. The script now +# checks the error codes and produces errors when there was a problem. +# +# 2. nvcc has been known to not delete incomplete results when it +# encounters problems. This confuses build systems into thinking the +# target was generated when in fact an unusable file exists. The script +# now deletes the output files if there was an error. +# +# 3. By putting all the options that affect the build into a file and then +# make the build rule dependent on the file, the output files will be +# regenerated when the options change. 
+# +# This script also looks at optional arguments STATIC, SHARED, or MODULE to +# determine when to target the object compilation for a shared library. +# BUILD_SHARED_LIBS is ignored in CUDA_WRAP_SRCS, but it is respected in +# CUDA_ADD_LIBRARY. On some systems special flags are added for building +# objects intended for shared libraries. A preprocessor macro, +# _EXPORTS is defined when a shared library compilation is +# detected. +# +# Flags passed into add_definitions with -D or /D are passed along to nvcc. +# +# +# +# The script defines the following variables:: +# +# CUDA_VERSION_MAJOR -- The major version of cuda as reported by nvcc. +# CUDA_VERSION_MINOR -- The minor version. +# CUDA_VERSION +# CUDA_VERSION_STRING -- CUDA_VERSION_MAJOR.CUDA_VERSION_MINOR +# +# CUDA_TOOLKIT_ROOT_DIR -- Path to the CUDA Toolkit (defined if not set). +# CUDA_SDK_ROOT_DIR -- Path to the CUDA SDK. Use this to find files in the +# SDK. This script will not directly support finding +# specific libraries or headers, as that isn't +# supported by NVIDIA. If you want to change +# libraries when the path changes see the +# FindCUDA.cmake script for an example of how to clear +# these variables. There are also examples of how to +# use the CUDA_SDK_ROOT_DIR to locate headers or +# libraries, if you so choose (at your own risk). +# CUDA_INCLUDE_DIRS -- Include directory for cuda headers. Added automatically +# for CUDA_ADD_EXECUTABLE and CUDA_ADD_LIBRARY. +# CUDA_LIBRARIES -- Cuda RT library. +# CUDA_CUFFT_LIBRARIES -- Device or emulation library for the Cuda FFT +# implementation (alternative to: +# CUDA_ADD_CUFFT_TO_TARGET macro) +# CUDA_CUBLAS_LIBRARIES -- Device or emulation library for the Cuda BLAS +# implementation (alterative to: +# CUDA_ADD_CUBLAS_TO_TARGET macro). +# CUDA_cudart_static_LIBRARY -- Statically linkable cuda runtime library. +# Only available for CUDA version 5.5+ +# CUDA_cupti_LIBRARY -- CUDA Profiling Tools Interface library. +# Only available for CUDA version 4.0+. +# CUDA_curand_LIBRARY -- CUDA Random Number Generation library. +# Only available for CUDA version 3.2+. +# CUDA_cusolver_LIBRARY -- CUDA Direct Solver library. +# Only available for CUDA version 7.0+. +# CUDA_cusparse_LIBRARY -- CUDA Sparse Matrix library. +# Only available for CUDA version 3.2+. +# CUDA_npp_LIBRARY -- NVIDIA Performance Primitives lib. +# Only available for CUDA version 4.0+. +# CUDA_nppc_LIBRARY -- NVIDIA Performance Primitives lib (core). +# Only available for CUDA version 5.5+. +# CUDA_nppi_LIBRARY -- NVIDIA Performance Primitives lib (image processing). +# Only available for CUDA version 5.5+. +# CUDA_npps_LIBRARY -- NVIDIA Performance Primitives lib (signal processing). +# Only available for CUDA version 5.5+. +# CUDA_nvcuvenc_LIBRARY -- CUDA Video Encoder library. +# Only available for CUDA version 3.2+. +# Windows only. +# CUDA_nvcuvid_LIBRARY -- CUDA Video Decoder library. +# Only available for CUDA version 3.2+. +# Windows only. +# + +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# Copyright (c) 2007-2009 +# Scientific Computing and Imaging Institute, University of Utah +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. +# +############################################################################### + +# FindCUDA.cmake + +# This macro helps us find the location of helper files we will need the full path to +macro(CUDA_FIND_HELPER_FILE _name _extension) + set(_full_name "${_name}.${_extension}") + # CMAKE_CURRENT_LIST_FILE contains the full path to the file currently being + # processed. Using this variable, we can pull out the current path, and + # provide a way to get access to the other files we need local to here. + get_filename_component(CMAKE_CURRENT_LIST_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) + set(CUDA_${_name} "${CMAKE_CURRENT_LIST_DIR}/FindCUDA/${_full_name}") + if(NOT EXISTS "${CUDA_${_name}}") + set(error_message "${_full_name} not found in ${CMAKE_CURRENT_LIST_DIR}/FindCUDA") + if(CUDA_FIND_REQUIRED) + message(FATAL_ERROR "${error_message}") + else() + if(NOT CUDA_FIND_QUIETLY) + message(STATUS "${error_message}") + endif() + endif() + endif() + # Set this variable as internal, so the user isn't bugged with it. + set(CUDA_${_name} ${CUDA_${_name}} CACHE INTERNAL "Location of ${_full_name}" FORCE) +endmacro() + +##################################################################### +## CUDA_INCLUDE_NVCC_DEPENDENCIES +## + +# So we want to try and include the dependency file if it exists. If +# it doesn't exist then we need to create an empty one, so we can +# include it. + +# If it does exist, then we need to check to see if all the files it +# depends on exist. If they don't then we should clear the dependency +# file and regenerate it later. This covers the case where a header +# file has disappeared or moved. + +macro(CUDA_INCLUDE_NVCC_DEPENDENCIES dependency_file) + set(CUDA_NVCC_DEPEND) + set(CUDA_NVCC_DEPEND_REGENERATE FALSE) + + + # Include the dependency file. Create it first if it doesn't exist . The + # INCLUDE puts a dependency that will force CMake to rerun and bring in the + # new info when it changes. DO NOT REMOVE THIS (as I did and spent a few + # hours figuring out why it didn't work. + if(NOT EXISTS ${dependency_file}) + file(WRITE ${dependency_file} "#FindCUDA.cmake generated file. Do not edit.\n") + endif() + # Always include this file to force CMake to run again next + # invocation and rebuild the dependencies. 
+ #message("including dependency_file = ${dependency_file}") + include(${dependency_file}) + + # Now we need to verify the existence of all the included files + # here. If they aren't there we need to just blank this variable and + # make the file regenerate again. +# if(DEFINED CUDA_NVCC_DEPEND) +# message("CUDA_NVCC_DEPEND set") +# else() +# message("CUDA_NVCC_DEPEND NOT set") +# endif() + if(CUDA_NVCC_DEPEND) + #message("CUDA_NVCC_DEPEND found") + foreach(f ${CUDA_NVCC_DEPEND}) + # message("searching for ${f}") + if(NOT EXISTS ${f}) + #message("file ${f} not found") + set(CUDA_NVCC_DEPEND_REGENERATE TRUE) + endif() + endforeach() + else() + #message("CUDA_NVCC_DEPEND false") + # No dependencies, so regenerate the file. + set(CUDA_NVCC_DEPEND_REGENERATE TRUE) + endif() + + #message("CUDA_NVCC_DEPEND_REGENERATE = ${CUDA_NVCC_DEPEND_REGENERATE}") + # No incoming dependencies, so we need to generate them. Make the + # output depend on the dependency file itself, which should cause the + # rule to re-run. + if(CUDA_NVCC_DEPEND_REGENERATE) + set(CUDA_NVCC_DEPEND ${dependency_file}) + #message("Generating an empty dependency_file: ${dependency_file}") + file(WRITE ${dependency_file} "#FindCUDA.cmake generated file. Do not edit.\n") + endif() + +endmacro() + +############################################################################### +############################################################################### +# Setup variables' defaults +############################################################################### +############################################################################### + +# Allow the user to specify if the device code is supposed to be 32 or 64 bit. +if(CMAKE_SIZEOF_VOID_P EQUAL 8) + set(CUDA_64_BIT_DEVICE_CODE_DEFAULT ON) +else() + set(CUDA_64_BIT_DEVICE_CODE_DEFAULT OFF) +endif() +option(CUDA_64_BIT_DEVICE_CODE "Compile device code in 64 bit mode" ${CUDA_64_BIT_DEVICE_CODE_DEFAULT}) + +# Attach the build rule to the source file in VS. This option +option(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE "Attach the build rule to the CUDA source file. Enable only when the CUDA source file is added to at most one target." ON) + +# Prints out extra information about the cuda file during compilation +option(CUDA_BUILD_CUBIN "Generate and parse .cubin files in Device mode." OFF) + +# Set whether we are using emulation or device mode. +option(CUDA_BUILD_EMULATION "Build in Emulation mode" OFF) + +# Where to put the generated output. +set(CUDA_GENERATED_OUTPUT_DIR "" CACHE PATH "Directory to put all the output files. If blank it will default to the CMAKE_CURRENT_BINARY_DIR") + +# Parse HOST_COMPILATION mode. +option(CUDA_HOST_COMPILATION_CPP "Generated file extension" ON) + +# Extra user settable flags +set(CUDA_NVCC_FLAGS "" CACHE STRING "Semi-colon delimit multiple arguments.") + +if(CMAKE_GENERATOR MATCHES "Visual Studio") + set(CUDA_HOST_COMPILER "$(VCInstallDir)bin" CACHE FILEPATH "Host side compiler used by NVCC") +else() + if(APPLE + AND "${CMAKE_C_COMPILER_ID}" MATCHES "Clang" + AND "${CMAKE_C_COMPILER}" MATCHES "/cc$") + # Using cc which is symlink to clang may let NVCC think it is GCC and issue + # unhandled -dumpspecs option to clang. Also in case neither + # CMAKE_C_COMPILER is defined (project does not use C language) nor + # CUDA_HOST_COMPILER is specified manually we should skip -ccbin and let + # nvcc use its own default C compiler. 
+ # Only care about this on APPLE with clang to avoid + # following symlinks to things like ccache + if(DEFINED CMAKE_C_COMPILER AND NOT DEFINED CUDA_HOST_COMPILER) + get_filename_component(c_compiler_realpath "${CMAKE_C_COMPILER}" REALPATH) + # if the real path does not end up being clang then + # go back to using CMAKE_C_COMPILER + if(NOT "${c_compiler_realpath}" MATCHES "/clang$") + set(c_compiler_realpath "${CMAKE_C_COMPILER}") + endif() + else() + set(c_compiler_realpath "") + endif() + set(CUDA_HOST_COMPILER "${c_compiler_realpath}" CACHE FILEPATH "Host side compiler used by NVCC") + else() + set(CUDA_HOST_COMPILER "${CMAKE_C_COMPILER}" + CACHE FILEPATH "Host side compiler used by NVCC") + endif() +endif() + +# Propagate the host flags to the host compiler via -Xcompiler +option(CUDA_PROPAGATE_HOST_FLAGS "Propage C/CXX_FLAGS and friends to the host compiler via -Xcompile" ON) + +# Enable CUDA_SEPARABLE_COMPILATION +option(CUDA_SEPARABLE_COMPILATION "Compile CUDA objects with separable compilation enabled. Requires CUDA 5.0+" OFF) + +# Specifies whether the commands used when compiling the .cu file will be printed out. +option(CUDA_VERBOSE_BUILD "Print out the commands run while compiling the CUDA source file. With the Makefile generator this defaults to VERBOSE variable specified on the command line, but can be forced on with this option." OFF) + +mark_as_advanced( + CUDA_64_BIT_DEVICE_CODE + CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE + CUDA_GENERATED_OUTPUT_DIR + CUDA_HOST_COMPILATION_CPP + CUDA_NVCC_FLAGS + CUDA_PROPAGATE_HOST_FLAGS + CUDA_BUILD_CUBIN + CUDA_BUILD_EMULATION + CUDA_VERBOSE_BUILD + CUDA_SEPARABLE_COMPILATION + ) + +# Makefile and similar generators don't define CMAKE_CONFIGURATION_TYPES, so we +# need to add another entry for the CMAKE_BUILD_TYPE. We also need to add the +# standerd set of 4 build types (Debug, MinSizeRel, Release, and RelWithDebInfo) +# for completeness. We need run this loop in order to accomodate the addition +# of extra configuration types. Duplicate entries will be removed by +# REMOVE_DUPLICATES. +set(CUDA_configuration_types ${CMAKE_CONFIGURATION_TYPES} ${CMAKE_BUILD_TYPE} Debug MinSizeRel Release RelWithDebInfo) +list(REMOVE_DUPLICATES CUDA_configuration_types) +foreach(config ${CUDA_configuration_types}) + string(TOUPPER ${config} config_upper) + set(CUDA_NVCC_FLAGS_${config_upper} "" CACHE STRING "Semi-colon delimit multiple arguments.") + mark_as_advanced(CUDA_NVCC_FLAGS_${config_upper}) +endforeach() + +############################################################################### +############################################################################### +# Locate CUDA, Set Build Type, etc. +############################################################################### +############################################################################### + +macro(cuda_unset_include_and_libraries) + unset(CUDA_TOOLKIT_INCLUDE CACHE) + unset(CUDA_CUDART_LIBRARY CACHE) + unset(CUDA_CUDA_LIBRARY CACHE) + # Make sure you run this before you unset CUDA_VERSION. 
+ if(CUDA_VERSION VERSION_EQUAL "3.0") + # This only existed in the 3.0 version of the CUDA toolkit + unset(CUDA_CUDARTEMU_LIBRARY CACHE) + endif() + unset(CUDA_cudart_static_LIBRARY CACHE) + unset(CUDA_cublas_LIBRARY CACHE) + unset(CUDA_cublasemu_LIBRARY CACHE) + unset(CUDA_cufft_LIBRARY CACHE) + unset(CUDA_cufftemu_LIBRARY CACHE) + unset(CUDA_cupti_LIBRARY CACHE) + unset(CUDA_curand_LIBRARY CACHE) + unset(CUDA_cusolver_LIBRARY CACHE) + unset(CUDA_cusparse_LIBRARY CACHE) + unset(CUDA_npp_LIBRARY CACHE) + unset(CUDA_nppc_LIBRARY CACHE) + unset(CUDA_nppi_LIBRARY CACHE) + unset(CUDA_npps_LIBRARY CACHE) + unset(CUDA_nvcuvenc_LIBRARY CACHE) + unset(CUDA_nvcuvid_LIBRARY CACHE) + + unset(CUDA_USE_STATIC_CUDA_RUNTIME CACHE) +endmacro() + +# Check to see if the CUDA_TOOLKIT_ROOT_DIR and CUDA_SDK_ROOT_DIR have changed, +# if they have then clear the cache variables, so that will be detected again. +if(NOT "${CUDA_TOOLKIT_ROOT_DIR}" STREQUAL "${CUDA_TOOLKIT_ROOT_DIR_INTERNAL}") + unset(CUDA_TOOLKIT_TARGET_DIR CACHE) + unset(CUDA_NVCC_EXECUTABLE CACHE) + cuda_unset_include_and_libraries() + unset(CUDA_VERSION CACHE) +endif() + +if(NOT "${CUDA_TOOLKIT_TARGET_DIR}" STREQUAL "${CUDA_TOOLKIT_TARGET_DIR_INTERNAL}") + cuda_unset_include_and_libraries() +endif() + +if(NOT "${CUDA_SDK_ROOT_DIR}" STREQUAL "${CUDA_SDK_ROOT_DIR_INTERNAL}") + # No specific variables to catch. Use this kind of code before calling + # find_package(CUDA) to clean up any variables that may depend on this path. + + # unset(MY_SPECIAL_CUDA_SDK_INCLUDE_DIR CACHE) + # unset(MY_SPECIAL_CUDA_SDK_LIBRARY CACHE) +endif() + +# Search for the cuda distribution. +if(NOT CUDA_TOOLKIT_ROOT_DIR) + + # Search in the CUDA_BIN_PATH first. + find_path(CUDA_TOOLKIT_ROOT_DIR + NAMES nvcc nvcc.exe + PATHS + ENV CUDA_PATH + ENV CUDA_BIN_PATH + PATH_SUFFIXES bin bin64 + DOC "Toolkit location." + NO_DEFAULT_PATH + ) + # Now search default paths + find_path(CUDA_TOOLKIT_ROOT_DIR + NAMES nvcc nvcc.exe + PATHS /usr/local/bin + /usr/local/cuda/bin + DOC "Toolkit location." + ) + + if (CUDA_TOOLKIT_ROOT_DIR) + string(REGEX REPLACE "[/\\\\]?bin[64]*[/\\\\]?$" "" CUDA_TOOLKIT_ROOT_DIR ${CUDA_TOOLKIT_ROOT_DIR}) + # We need to force this back into the cache. + set(CUDA_TOOLKIT_ROOT_DIR ${CUDA_TOOLKIT_ROOT_DIR} CACHE PATH "Toolkit location." FORCE) + endif() + if (NOT EXISTS ${CUDA_TOOLKIT_ROOT_DIR}) + if(CUDA_FIND_REQUIRED) + message(FATAL_ERROR "Specify CUDA_TOOLKIT_ROOT_DIR") + elseif(NOT CUDA_FIND_QUIETLY) + message("CUDA_TOOLKIT_ROOT_DIR not found or specified") + endif() + endif () +endif () + +# CUDA_NVCC_EXECUTABLE +find_program(CUDA_NVCC_EXECUTABLE + NAMES nvcc + PATHS "${CUDA_TOOLKIT_ROOT_DIR}" + ENV CUDA_PATH + ENV CUDA_BIN_PATH + PATH_SUFFIXES bin bin64 + NO_DEFAULT_PATH + ) +# Search default search paths, after we search our own set of paths. +find_program(CUDA_NVCC_EXECUTABLE nvcc) +mark_as_advanced(CUDA_NVCC_EXECUTABLE) + +if(CUDA_NVCC_EXECUTABLE AND NOT CUDA_VERSION) + # Compute the version. 
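+  # For illustration: an nvcc banner line such as
+  #   "Cuda compilation tools, release 7.5, V7.5.17"
+  # would yield CUDA_VERSION_MAJOR=7 and CUDA_VERSION_MINOR=5 below (the exact
+  # wording of the banner varies between toolkit releases).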
+ execute_process (COMMAND ${CUDA_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT) + string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\1" CUDA_VERSION_MAJOR ${NVCC_OUT}) + string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\2" CUDA_VERSION_MINOR ${NVCC_OUT}) + set(CUDA_VERSION "${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}" CACHE STRING "Version of CUDA as computed from nvcc.") + mark_as_advanced(CUDA_VERSION) +else() + # Need to set these based off of the cached value + string(REGEX REPLACE "([0-9]+)\\.([0-9]+).*" "\\1" CUDA_VERSION_MAJOR "${CUDA_VERSION}") + string(REGEX REPLACE "([0-9]+)\\.([0-9]+).*" "\\2" CUDA_VERSION_MINOR "${CUDA_VERSION}") +endif() + +# Always set this convenience variable +set(CUDA_VERSION_STRING "${CUDA_VERSION}") + +# Support for arm cross compilation with CUDA 5.5 +if(CUDA_VERSION VERSION_GREATER "5.0" AND CMAKE_CROSSCOMPILING AND CMAKE_SYSTEM_PROCESSOR MATCHES "arm" AND EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-gnueabihf") + set(CUDA_TOOLKIT_TARGET_DIR "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-gnueabihf" CACHE PATH "Toolkit target location.") +else() + set(CUDA_TOOLKIT_TARGET_DIR "${CUDA_TOOLKIT_ROOT_DIR}" CACHE PATH "Toolkit target location.") +endif() +mark_as_advanced(CUDA_TOOLKIT_TARGET_DIR) + +# Target CPU architecture +if(CUDA_VERSION VERSION_GREATER "5.0" AND CMAKE_CROSSCOMPILING AND CMAKE_SYSTEM_PROCESSOR MATCHES "arm") + set(_cuda_target_cpu_arch_initial "ARM") +else() + set(_cuda_target_cpu_arch_initial "") +endif() +set(CUDA_TARGET_CPU_ARCH ${_cuda_target_cpu_arch_initial} CACHE STRING "Specify the name of the class of CPU architecture for which the input files must be compiled.") +mark_as_advanced(CUDA_TARGET_CPU_ARCH) + +# CUDA_TOOLKIT_INCLUDE +find_path(CUDA_TOOLKIT_INCLUDE + device_functions.h # Header included in toolkit + PATHS "${CUDA_TOOLKIT_TARGET_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}" + ENV CUDA_PATH + ENV CUDA_INC_PATH + PATH_SUFFIXES include + NO_DEFAULT_PATH + ) +# Search default search paths, after we search our own set of paths. +find_path(CUDA_TOOLKIT_INCLUDE device_functions.h) +mark_as_advanced(CUDA_TOOLKIT_INCLUDE) + +# Set the user list of include dir to nothing to initialize it. +set (CUDA_NVCC_INCLUDE_ARGS_USER "") +set (CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE}) + +macro(cuda_find_library_local_first_with_path_ext _var _names _doc _path_ext ) + if(CMAKE_SIZEOF_VOID_P EQUAL 8) + # CUDA 3.2+ on Windows moved the library directories, so we need the new + # and old paths. + set(_cuda_64bit_lib_dir "${_path_ext}lib/x64" "${_path_ext}lib64" "${_path_ext}libx64" ) + endif() + # CUDA 3.2+ on Windows moved the library directories, so we need to new + # (lib/Win32) and the old path (lib). + find_library(${_var} + NAMES ${_names} + PATHS "${CUDA_TOOLKIT_TARGET_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}" + ENV CUDA_PATH + ENV CUDA_LIB_PATH + PATH_SUFFIXES ${_cuda_64bit_lib_dir} "${_path_ext}lib/Win32" "${_path_ext}lib" "${_path_ext}libWin32" + DOC ${_doc} + NO_DEFAULT_PATH + ) + # Search default search paths, after we search our own set of paths. 
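+  # For illustration (install layout hypothetical): on a 64-bit Windows toolkit
+  # the restricted search above tries suffixes such as "lib/x64" under
+  # CUDA_TOOLKIT_ROOT_DIR before this unrestricted fallback search runs.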
+ find_library(${_var} + NAMES ${_names} + PATHS "/usr/lib/nvidia-current" + DOC ${_doc} + ) +endmacro() + +macro(cuda_find_library_local_first _var _names _doc) + cuda_find_library_local_first_with_path_ext( "${_var}" "${_names}" "${_doc}" "" ) +endmacro() + +macro(find_library_local_first _var _names _doc ) + cuda_find_library_local_first( "${_var}" "${_names}" "${_doc}" "" ) +endmacro() + + +# CUDA_LIBRARIES +cuda_find_library_local_first(CUDA_CUDART_LIBRARY cudart "\"cudart\" library") +if(CUDA_VERSION VERSION_EQUAL "3.0") + # The cudartemu library only existed for the 3.0 version of CUDA. + cuda_find_library_local_first(CUDA_CUDARTEMU_LIBRARY cudartemu "\"cudartemu\" library") + mark_as_advanced( + CUDA_CUDARTEMU_LIBRARY + ) +endif() +if(NOT CUDA_VERSION VERSION_LESS "5.5") + cuda_find_library_local_first(CUDA_cudart_static_LIBRARY cudart_static "static CUDA runtime library") + mark_as_advanced(CUDA_cudart_static_LIBRARY) +endif() +if(CUDA_cudart_static_LIBRARY) + # Set whether to use the static cuda runtime. + option(CUDA_USE_STATIC_CUDA_RUNTIME "Use the static version of the CUDA runtime library if available" ON) +else() + option(CUDA_USE_STATIC_CUDA_RUNTIME "Use the static version of the CUDA runtime library if available" OFF) +endif() + +if(CUDA_USE_STATIC_CUDA_RUNTIME) + if(UNIX) + # Check for the dependent libraries. Here we look for pthreads. + if (DEFINED CMAKE_THREAD_PREFER_PTHREAD) + set(_cuda_cmake_thread_prefer_pthread ${CMAKE_THREAD_PREFER_PTHREAD}) + endif() + set(CMAKE_THREAD_PREFER_PTHREAD 1) + + # Many of the FindXYZ CMake comes with makes use of try_compile with int main(){return 0;} + # as the source file. Unfortunately this causes a warning with -Wstrict-prototypes and + # -Werror causes the try_compile to fail. We will just temporarily disable other flags + # when doing the find_package command here. + set(_cuda_cmake_c_flags ${CMAKE_C_FLAGS}) + set(CMAKE_C_FLAGS "-fPIC") + find_package(Threads REQUIRED) + set(CMAKE_C_FLAGS ${_cuda_cmake_c_flags}) + + if (DEFINED _cuda_cmake_thread_prefer_pthread) + set(CMAKE_THREAD_PREFER_PTHREAD ${_cuda_cmake_thread_prefer_pthread}) + unset(_cuda_cmake_thread_prefer_pthread) + else() + unset(CMAKE_THREAD_PREFER_PTHREAD) + endif() + if (NOT APPLE) + # Here is librt that has things such as, clock_gettime, shm_open, and shm_unlink. + find_library(CUDA_rt_LIBRARY rt) + find_library(CUDA_dl_LIBRARY dl) + if (NOT CUDA_rt_LIBRARY) + message(WARNING "Expecting to find librt for libcudart_static, but didn't find it.") + endif() + if (NOT CUDA_dl_LIBRARY) + message(WARNING "Expecting to find libdl for libcudart_static, but didn't find it.") + endif() + endif() + endif() +endif() + +# CUPTI library showed up in cuda toolkit 4.0 +if(NOT CUDA_VERSION VERSION_LESS "4.0") + cuda_find_library_local_first_with_path_ext(CUDA_cupti_LIBRARY cupti "\"cupti\" library" "extras/CUPTI/") + mark_as_advanced(CUDA_cupti_LIBRARY) +endif() + +# Set the CUDA_LIBRARIES variable. This is the set of stuff to link against if you are +# using the CUDA runtime. For the dynamic version of the runtime, most of the +# dependencies are brough in, but for the static version there are additional libraries +# and linker commands needed. +# Initialize to empty +set(CUDA_LIBRARIES) + +# If we are using emulation mode and we found the cudartemu library then use +# that one instead of cudart. 
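+# For illustration only (library names and paths hypothetical): with
+# CUDA_USE_STATIC_CUDA_RUNTIME enabled on Linux, CUDA_LIBRARIES typically ends
+# up looking something like
+#   /usr/local/cuda/lib64/libcudart_static.a;-lpthread;/usr/lib/librt.so;/usr/lib/libdl.so
+# whereas the default dynamic case reduces to just the cudart library.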
+if(CUDA_BUILD_EMULATION AND CUDA_CUDARTEMU_LIBRARY) + list(APPEND CUDA_LIBRARIES ${CUDA_CUDARTEMU_LIBRARY}) +elseif(CUDA_USE_STATIC_CUDA_RUNTIME AND CUDA_cudart_static_LIBRARY) + list(APPEND CUDA_LIBRARIES ${CUDA_cudart_static_LIBRARY} ${CMAKE_THREAD_LIBS_INIT}) + if (CUDA_rt_LIBRARY) + list(APPEND CUDA_LIBRARIES ${CUDA_rt_LIBRARY}) + endif() + if (CUDA_dl_LIBRARY) + list(APPEND CUDA_LIBRARIES ${CUDA_dl_LIBRARY}) + endif() + if(APPLE) + # We need to add the default path to the driver (libcuda.dylib) as an rpath, so that + # the static cuda runtime can find it at runtime. + list(APPEND CUDA_LIBRARIES -Wl,-rpath,/usr/local/cuda/lib) + endif() +else() + list(APPEND CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY}) +endif() + +# 1.1 toolkit on linux doesn't appear to have a separate library on +# some platforms. +cuda_find_library_local_first(CUDA_CUDA_LIBRARY cuda "\"cuda\" library (older versions only).") + +mark_as_advanced( + CUDA_CUDA_LIBRARY + CUDA_CUDART_LIBRARY + ) + +####################### +# Look for some of the toolkit helper libraries +macro(FIND_CUDA_HELPER_LIBS _name) + cuda_find_library_local_first(CUDA_${_name}_LIBRARY ${_name} "\"${_name}\" library") + mark_as_advanced(CUDA_${_name}_LIBRARY) +endmacro() + +####################### +# Disable emulation for v3.1 onward +if(CUDA_VERSION VERSION_GREATER "3.0") + if(CUDA_BUILD_EMULATION) + message(FATAL_ERROR "CUDA_BUILD_EMULATION is not supported in version 3.1 and onwards. You must disable it to proceed. You have version ${CUDA_VERSION}.") + endif() +endif() + +# Search for additional CUDA toolkit libraries. +if(CUDA_VERSION VERSION_LESS "3.1") + # Emulation libraries aren't available in version 3.1 onward. + find_cuda_helper_libs(cufftemu) + find_cuda_helper_libs(cublasemu) +endif() +find_cuda_helper_libs(cufft) +find_cuda_helper_libs(cublas) +if(NOT CUDA_VERSION VERSION_LESS "3.2") + # cusparse showed up in version 3.2 + find_cuda_helper_libs(cusparse) + find_cuda_helper_libs(curand) + if (WIN32) + find_cuda_helper_libs(nvcuvenc) + find_cuda_helper_libs(nvcuvid) + endif() +endif() +if(CUDA_VERSION VERSION_GREATER "5.0") + # In CUDA 5.5 NPP was splitted onto 3 separate libraries. + find_cuda_helper_libs(nppc) + find_cuda_helper_libs(nppi) + find_cuda_helper_libs(npps) + set(CUDA_npp_LIBRARY "${CUDA_nppc_LIBRARY};${CUDA_nppi_LIBRARY};${CUDA_npps_LIBRARY}") +elseif(NOT CUDA_VERSION VERSION_LESS "4.0") + find_cuda_helper_libs(npp) +endif() +if(NOT CUDA_VERSION VERSION_LESS "7.0") + # cusolver showed up in version 7.0 + find_cuda_helper_libs(cusolver) +endif() + +if (CUDA_BUILD_EMULATION) + set(CUDA_CUFFT_LIBRARIES ${CUDA_cufftemu_LIBRARY}) + set(CUDA_CUBLAS_LIBRARIES ${CUDA_cublasemu_LIBRARY}) +else() + set(CUDA_CUFFT_LIBRARIES ${CUDA_cufft_LIBRARY}) + set(CUDA_CUBLAS_LIBRARIES ${CUDA_cublas_LIBRARY}) +endif() + +######################## +# Look for the SDK stuff. As of CUDA 3.0 NVSDKCUDA_ROOT has been replaced with +# NVSDKCOMPUTE_ROOT with the old CUDA C contents moved into the C subdirectory +find_path(CUDA_SDK_ROOT_DIR common/inc/cutil.h + HINTS + "$ENV{NVSDKCOMPUTE_ROOT}/C" + ENV NVSDKCUDA_ROOT + "[HKEY_LOCAL_MACHINE\\SOFTWARE\\NVIDIA Corporation\\Installed Products\\NVIDIA SDK 10\\Compute;InstallDir]" + PATHS + "/Developer/GPU\ Computing/C" + ) + +# Keep the CUDA_SDK_ROOT_DIR first in order to be able to override the +# environment variables. 
+set(CUDA_SDK_SEARCH_PATH + "${CUDA_SDK_ROOT_DIR}" + "${CUDA_TOOLKIT_ROOT_DIR}/local/NVSDK0.2" + "${CUDA_TOOLKIT_ROOT_DIR}/NVSDK0.2" + "${CUDA_TOOLKIT_ROOT_DIR}/NV_CUDA_SDK" + "$ENV{HOME}/NVIDIA_CUDA_SDK" + "$ENV{HOME}/NVIDIA_CUDA_SDK_MACOSX" + "/Developer/CUDA" + ) + +# Example of how to find an include file from the CUDA_SDK_ROOT_DIR + +# find_path(CUDA_CUT_INCLUDE_DIR +# cutil.h +# PATHS ${CUDA_SDK_SEARCH_PATH} +# PATH_SUFFIXES "common/inc" +# DOC "Location of cutil.h" +# NO_DEFAULT_PATH +# ) +# # Now search system paths +# find_path(CUDA_CUT_INCLUDE_DIR cutil.h DOC "Location of cutil.h") + +# mark_as_advanced(CUDA_CUT_INCLUDE_DIR) + + +# Example of how to find a library in the CUDA_SDK_ROOT_DIR + +# # cutil library is called cutil64 for 64 bit builds on windows. We don't want +# # to get these confused, so we are setting the name based on the word size of +# # the build. + +# if(CMAKE_SIZEOF_VOID_P EQUAL 8) +# set(cuda_cutil_name cutil64) +# else() +# set(cuda_cutil_name cutil32) +# endif() + +# find_library(CUDA_CUT_LIBRARY +# NAMES cutil ${cuda_cutil_name} +# PATHS ${CUDA_SDK_SEARCH_PATH} +# # The new version of the sdk shows up in common/lib, but the old one is in lib +# PATH_SUFFIXES "common/lib" "lib" +# DOC "Location of cutil library" +# NO_DEFAULT_PATH +# ) +# # Now search system paths +# find_library(CUDA_CUT_LIBRARY NAMES cutil ${cuda_cutil_name} DOC "Location of cutil library") +# mark_as_advanced(CUDA_CUT_LIBRARY) +# set(CUDA_CUT_LIBRARIES ${CUDA_CUT_LIBRARY}) + + + +############################# +# Check for required components +set(CUDA_FOUND TRUE) + +set(CUDA_TOOLKIT_ROOT_DIR_INTERNAL "${CUDA_TOOLKIT_ROOT_DIR}" CACHE INTERNAL + "This is the value of the last time CUDA_TOOLKIT_ROOT_DIR was set successfully." FORCE) +set(CUDA_TOOLKIT_TARGET_DIR_INTERNAL "${CUDA_TOOLKIT_TARGET_DIR}" CACHE INTERNAL + "This is the value of the last time CUDA_TOOLKIT_TARGET_DIR was set successfully." FORCE) +set(CUDA_SDK_ROOT_DIR_INTERNAL "${CUDA_SDK_ROOT_DIR}" CACHE INTERNAL + "This is the value of the last time CUDA_SDK_ROOT_DIR was set successfully." FORCE) + +include(${CMAKE_CURRENT_LIST_DIR}/FindPackageHandleStandardArgs.cmake) +find_package_handle_standard_args(CUDA + REQUIRED_VARS + CUDA_TOOLKIT_ROOT_DIR + CUDA_NVCC_EXECUTABLE + CUDA_INCLUDE_DIRS + CUDA_CUDART_LIBRARY + VERSION_VAR + CUDA_VERSION + ) + + + +############################################################################### +############################################################################### +# Macros +############################################################################### +############################################################################### + +############################################################################### +# Add include directories to pass to the nvcc command. 
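+# Illustrative usage (directory name hypothetical):
+#   cuda_include_directories(${CMAKE_CURRENT_SOURCE_DIR}/kernels)
+# The directories accumulate in CUDA_NVCC_INCLUDE_ARGS_USER and are passed to
+# every nvcc invocation set up by CUDA_WRAP_SRCS.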
+macro(CUDA_INCLUDE_DIRECTORIES) + foreach(dir ${ARGN}) + list(APPEND CUDA_NVCC_INCLUDE_ARGS_USER -I${dir}) + endforeach() +endmacro() + + +############################################################################## +cuda_find_helper_file(parse_cubin cmake) +cuda_find_helper_file(make2cmake cmake) +cuda_find_helper_file(run_nvcc cmake) + +############################################################################## +# Separate the OPTIONS out from the sources +# +macro(CUDA_GET_SOURCES_AND_OPTIONS _sources _cmake_options _options) + set( ${_sources} ) + set( ${_cmake_options} ) + set( ${_options} ) + set( _found_options FALSE ) + foreach(arg ${ARGN}) + if("x${arg}" STREQUAL "xOPTIONS") + set( _found_options TRUE ) + elseif( + "x${arg}" STREQUAL "xWIN32" OR + "x${arg}" STREQUAL "xMACOSX_BUNDLE" OR + "x${arg}" STREQUAL "xEXCLUDE_FROM_ALL" OR + "x${arg}" STREQUAL "xSTATIC" OR + "x${arg}" STREQUAL "xSHARED" OR + "x${arg}" STREQUAL "xMODULE" + ) + list(APPEND ${_cmake_options} ${arg}) + else() + if ( _found_options ) + list(APPEND ${_options} ${arg}) + else() + # Assume this is a file + list(APPEND ${_sources} ${arg}) + endif() + endif() + endforeach() +endmacro() + +############################################################################## +# Parse the OPTIONS from ARGN and set the variables prefixed by _option_prefix +# +macro(CUDA_PARSE_NVCC_OPTIONS _option_prefix) + set( _found_config ) + foreach(arg ${ARGN}) + # Determine if we are dealing with a perconfiguration flag + foreach(config ${CUDA_configuration_types}) + string(TOUPPER ${config} config_upper) + if (arg STREQUAL "${config_upper}") + set( _found_config _${arg}) + # Set arg to nothing to keep it from being processed further + set( arg ) + endif() + endforeach() + + if ( arg ) + list(APPEND ${_option_prefix}${_found_config} "${arg}") + endif() + endforeach() +endmacro() + +############################################################################## +# Helper to add the include directory for CUDA only once +function(CUDA_ADD_CUDA_INCLUDE_ONCE) + get_directory_property(_include_directories INCLUDE_DIRECTORIES) + set(_add TRUE) + if(_include_directories) + foreach(dir ${_include_directories}) + if("${dir}" STREQUAL "${CUDA_INCLUDE_DIRS}") + set(_add FALSE) + endif() + endforeach() + endif() + if(_add) + include_directories(${CUDA_INCLUDE_DIRS}) + endif() +endfunction() + +function(CUDA_BUILD_SHARED_LIBRARY shared_flag) + set(cmake_args ${ARGN}) + # If SHARED, MODULE, or STATIC aren't already in the list of arguments, then + # add SHARED or STATIC based on the value of BUILD_SHARED_LIBS. + list(FIND cmake_args SHARED _cuda_found_SHARED) + list(FIND cmake_args MODULE _cuda_found_MODULE) + list(FIND cmake_args STATIC _cuda_found_STATIC) + if( _cuda_found_SHARED GREATER -1 OR + _cuda_found_MODULE GREATER -1 OR + _cuda_found_STATIC GREATER -1) + set(_cuda_build_shared_libs) + else() + if (BUILD_SHARED_LIBS) + set(_cuda_build_shared_libs SHARED) + else() + set(_cuda_build_shared_libs STATIC) + endif() + endif() + set(${shared_flag} ${_cuda_build_shared_libs} PARENT_SCOPE) +endfunction() + +############################################################################## +# Helper to avoid clashes of files with the same basename but different paths. +# This doesn't attempt to do exactly what CMake internals do, which is to only +# add this path when there is a conflict, since by the time a second collision +# in names is detected it's already too late to fix the first one. 
For +# consistency sake the relative path will be added to all files. +function(CUDA_COMPUTE_BUILD_PATH path build_path) + #message("CUDA_COMPUTE_BUILD_PATH([${path}] ${build_path})") + # Only deal with CMake style paths from here on out + file(TO_CMAKE_PATH "${path}" bpath) + if (IS_ABSOLUTE "${bpath}") + # Absolute paths are generally unnessary, especially if something like + # file(GLOB_RECURSE) is used to pick up the files. + + string(FIND "${bpath}" "${CMAKE_CURRENT_BINARY_DIR}" _binary_dir_pos) + if (_binary_dir_pos EQUAL 0) + file(RELATIVE_PATH bpath "${CMAKE_CURRENT_BINARY_DIR}" "${bpath}") + else() + file(RELATIVE_PATH bpath "${CMAKE_CURRENT_SOURCE_DIR}" "${bpath}") + endif() + endif() + + # This recipe is from cmLocalGenerator::CreateSafeUniqueObjectFileName in the + # CMake source. + + # Remove leading / + string(REGEX REPLACE "^[/]+" "" bpath "${bpath}") + # Avoid absolute paths by removing ':' + string(REPLACE ":" "_" bpath "${bpath}") + # Avoid relative paths that go up the tree + string(REPLACE "../" "__/" bpath "${bpath}") + # Avoid spaces + string(REPLACE " " "_" bpath "${bpath}") + + # Strip off the filename. I wait until here to do it, since removin the + # basename can make a path that looked like path/../basename turn into + # path/.. (notice the trailing slash). + get_filename_component(bpath "${bpath}" PATH) + + set(${build_path} "${bpath}" PARENT_SCOPE) + #message("${build_path} = ${bpath}") +endfunction() + +############################################################################## +# This helper macro populates the following variables and setups up custom +# commands and targets to invoke the nvcc compiler to generate C or PTX source +# dependent upon the format parameter. The compiler is invoked once with -M +# to generate a dependency file and a second time with -cuda or -ptx to generate +# a .cpp or .ptx file. +# INPUT: +# cuda_target - Target name +# format - PTX, CUBIN, FATBIN or OBJ +# FILE1 .. FILEN - The remaining arguments are the sources to be wrapped. +# OPTIONS - Extra options to NVCC +# OUTPUT: +# generated_files - List of generated files +############################################################################## +############################################################################## + +macro(CUDA_WRAP_SRCS cuda_target format generated_files) + + # If CMake doesn't support separable compilation, complain + if(CUDA_SEPARABLE_COMPILATION AND CMAKE_VERSION VERSION_LESS "2.8.10.1") + message(SEND_ERROR "CUDA_SEPARABLE_COMPILATION isn't supported for CMake versions less than 2.8.10.1") + endif() + + # Set up all the command line flags here, so that they can be overridden on a per target basis. + + set(nvcc_flags "") + + # Emulation if the card isn't present. + if (CUDA_BUILD_EMULATION) + # Emulation. + set(nvcc_flags ${nvcc_flags} --device-emulation -D_DEVICEEMU -g) + else() + # Device mode. No flags necessary. + endif() + + if(CUDA_HOST_COMPILATION_CPP) + set(CUDA_C_OR_CXX CXX) + else() + if(CUDA_VERSION VERSION_LESS "3.0") + set(nvcc_flags ${nvcc_flags} --host-compilation C) + else() + message(WARNING "--host-compilation flag is deprecated in CUDA version >= 3.0. 
Removing --host-compilation C flag" ) + endif() + set(CUDA_C_OR_CXX C) + endif() + + set(generated_extension ${CMAKE_${CUDA_C_OR_CXX}_OUTPUT_EXTENSION}) + + if(CUDA_64_BIT_DEVICE_CODE) + set(nvcc_flags ${nvcc_flags} -m64) + else() + set(nvcc_flags ${nvcc_flags} -m32) + endif() + + if(CUDA_TARGET_CPU_ARCH) + set(nvcc_flags ${nvcc_flags} "--target-cpu-architecture=${CUDA_TARGET_CPU_ARCH}") + endif() + + # This needs to be passed in at this stage, because VS needs to fill out the + # value of VCInstallDir from within VS. Note that CCBIN is only used if + # -ccbin or --compiler-bindir isn't used and CUDA_HOST_COMPILER matches + # $(VCInstallDir)/bin. + if(CMAKE_GENERATOR MATCHES "Visual Studio") + set(ccbin_flags -D "\"CCBIN:PATH=$(VCInstallDir)bin\"" ) + else() + set(ccbin_flags) + endif() + + # Figure out which configure we will use and pass that in as an argument to + # the script. We need to defer the decision until compilation time, because + # for VS projects we won't know if we are making a debug or release build + # until build time. + if(CMAKE_GENERATOR MATCHES "Visual Studio") + set( CUDA_build_configuration "$(ConfigurationName)" ) + else() + set( CUDA_build_configuration "${CMAKE_BUILD_TYPE}") + endif() + + # Initialize our list of includes with the user ones followed by the CUDA system ones. + set(CUDA_NVCC_INCLUDE_ARGS ${CUDA_NVCC_INCLUDE_ARGS_USER} "-I${CUDA_INCLUDE_DIRS}") + # Get the include directories for this directory and use them for our nvcc command. + # Remove duplicate entries which may be present since include_directories + # in CMake >= 2.8.8 does not remove them. + get_directory_property(CUDA_NVCC_INCLUDE_DIRECTORIES INCLUDE_DIRECTORIES) + list(REMOVE_DUPLICATES CUDA_NVCC_INCLUDE_DIRECTORIES) + if(CUDA_NVCC_INCLUDE_DIRECTORIES) + foreach(dir ${CUDA_NVCC_INCLUDE_DIRECTORIES}) + list(APPEND CUDA_NVCC_INCLUDE_ARGS -I${dir}) + endforeach() + endif() + + # Reset these variables + set(CUDA_WRAP_OPTION_NVCC_FLAGS) + foreach(config ${CUDA_configuration_types}) + string(TOUPPER ${config} config_upper) + set(CUDA_WRAP_OPTION_NVCC_FLAGS_${config_upper}) + endforeach() + + CUDA_GET_SOURCES_AND_OPTIONS(_cuda_wrap_sources _cuda_wrap_cmake_options _cuda_wrap_options ${ARGN}) + CUDA_PARSE_NVCC_OPTIONS(CUDA_WRAP_OPTION_NVCC_FLAGS ${_cuda_wrap_options}) + + # Figure out if we are building a shared library. BUILD_SHARED_LIBS is + # respected in CUDA_ADD_LIBRARY. + set(_cuda_build_shared_libs FALSE) + # SHARED, MODULE + list(FIND _cuda_wrap_cmake_options SHARED _cuda_found_SHARED) + list(FIND _cuda_wrap_cmake_options MODULE _cuda_found_MODULE) + if(_cuda_found_SHARED GREATER -1 OR _cuda_found_MODULE GREATER -1) + set(_cuda_build_shared_libs TRUE) + endif() + # STATIC + list(FIND _cuda_wrap_cmake_options STATIC _cuda_found_STATIC) + if(_cuda_found_STATIC GREATER -1) + set(_cuda_build_shared_libs FALSE) + endif() + + # CUDA_HOST_FLAGS + if(_cuda_build_shared_libs) + # If we are setting up code for a shared library, then we need to add extra flags for + # compiling objects for shared libraries. + set(CUDA_HOST_SHARED_FLAGS ${CMAKE_SHARED_LIBRARY_${CUDA_C_OR_CXX}_FLAGS}) + else() + set(CUDA_HOST_SHARED_FLAGS) + endif() + # Only add the CMAKE_{C,CXX}_FLAGS if we are propagating host flags. We + # always need to set the SHARED_FLAGS, though. 
+ if(CUDA_PROPAGATE_HOST_FLAGS) + set(_cuda_host_flags "set(CMAKE_HOST_FLAGS ${CMAKE_${CUDA_C_OR_CXX}_FLAGS} ${CUDA_HOST_SHARED_FLAGS})") + else() + set(_cuda_host_flags "set(CMAKE_HOST_FLAGS ${CUDA_HOST_SHARED_FLAGS})") + endif() + + set(_cuda_nvcc_flags_config "# Build specific configuration flags") + # Loop over all the configuration types to generate appropriate flags for run_nvcc.cmake + foreach(config ${CUDA_configuration_types}) + string(TOUPPER ${config} config_upper) + # CMAKE_FLAGS are strings and not lists. By not putting quotes around CMAKE_FLAGS + # we convert the strings to lists (like we want). + + if(CUDA_PROPAGATE_HOST_FLAGS) + # nvcc chokes on -g3 in versions previous to 3.0, so replace it with -g + set(_cuda_fix_g3 FALSE) + + if(CMAKE_COMPILER_IS_GNUCC) + if (CUDA_VERSION VERSION_LESS "3.0" OR + CUDA_VERSION VERSION_EQUAL "4.1" OR + CUDA_VERSION VERSION_EQUAL "4.2" + ) + set(_cuda_fix_g3 TRUE) + endif() + endif() + if(_cuda_fix_g3) + string(REPLACE "-g3" "-g" _cuda_C_FLAGS "${CMAKE_${CUDA_C_OR_CXX}_FLAGS_${config_upper}}") + else() + set(_cuda_C_FLAGS "${CMAKE_${CUDA_C_OR_CXX}_FLAGS_${config_upper}}") + endif() + + set(_cuda_host_flags "${_cuda_host_flags}\nset(CMAKE_HOST_FLAGS_${config_upper} ${_cuda_C_FLAGS})") + endif() + + # Note that if we ever want CUDA_NVCC_FLAGS_ to be string (instead of a list + # like it is currently), we can remove the quotes around the + # ${CUDA_NVCC_FLAGS_${config_upper}} variable like the CMAKE_HOST_FLAGS_ variable. + set(_cuda_nvcc_flags_config "${_cuda_nvcc_flags_config}\nset(CUDA_NVCC_FLAGS_${config_upper} ${CUDA_NVCC_FLAGS_${config_upper}} ;; ${CUDA_WRAP_OPTION_NVCC_FLAGS_${config_upper}})") + endforeach() + + # Process the C++11 flag. If the host sets the flag, we need to add it to nvcc and + # remove it from the host. This is because -Xcompile -std=c++ will choke nvcc (it uses + # the C preprocessor). In order to get this to work correctly, we need to use nvcc's + # specific c++11 flag. + if( "${_cuda_host_flags}" MATCHES "-std=c\\+\\+11") + # Add the c++11 flag to nvcc if it isn't already present. Note that we only look at + # the main flag instead of the configuration specific flags. + if( NOT "${CUDA_NVCC_FLAGS}" MATCHES "-std;c\\+\\+11" ) + list(APPEND nvcc_flags --std c++11) + endif() + string(REGEX REPLACE "[-]+std=c\\+\\+11" "" _cuda_host_flags "${_cuda_host_flags}") + endif() + + # Get the list of definitions from the directory property + get_directory_property(CUDA_NVCC_DEFINITIONS COMPILE_DEFINITIONS) + if(CUDA_NVCC_DEFINITIONS) + foreach(_definition ${CUDA_NVCC_DEFINITIONS}) + list(APPEND nvcc_flags "-D${_definition}") + endforeach() + endif() + + if(_cuda_build_shared_libs) + list(APPEND nvcc_flags "-D${cuda_target}_EXPORTS") + endif() + + # Reset the output variable + set(_cuda_wrap_generated_files "") + + # Iterate over the macro arguments and create custom + # commands for all the .cu files. + foreach(file ${ARGN}) + # Ignore any file marked as a HEADER_FILE_ONLY + get_source_file_property(_is_header ${file} HEADER_FILE_ONLY) + # Allow per source file overrides of the format. Also allows compiling non-.cu files. + get_source_file_property(_cuda_source_format ${file} CUDA_SOURCE_PROPERTY_FORMAT) + if((${file} MATCHES "\\.cu$" OR _cuda_source_format) AND NOT _is_header) + + if(NOT _cuda_source_format) + set(_cuda_source_format ${format}) + endif() + # If file isn't a .cu file, we need to tell nvcc to treat it as such. 
+ if(NOT ${file} MATCHES "\\.cu$") + set(cuda_language_flag -x=cu) + else() + set(cuda_language_flag) + endif() + + if( ${_cuda_source_format} MATCHES "OBJ") + set( cuda_compile_to_external_module OFF ) + else() + set( cuda_compile_to_external_module ON ) + if( ${_cuda_source_format} MATCHES "PTX" ) + set( cuda_compile_to_external_module_type "ptx" ) + elseif( ${_cuda_source_format} MATCHES "CUBIN") + set( cuda_compile_to_external_module_type "cubin" ) + elseif( ${_cuda_source_format} MATCHES "FATBIN") + set( cuda_compile_to_external_module_type "fatbin" ) + else() + message( FATAL_ERROR "Invalid format flag passed to CUDA_WRAP_SRCS or set with CUDA_SOURCE_PROPERTY_FORMAT file property for file '${file}': '${_cuda_source_format}'. Use OBJ, PTX, CUBIN or FATBIN.") + endif() + endif() + + if(cuda_compile_to_external_module) + # Don't use any of the host compilation flags for PTX targets. + set(CUDA_HOST_FLAGS) + set(CUDA_NVCC_FLAGS_CONFIG) + else() + set(CUDA_HOST_FLAGS ${_cuda_host_flags}) + set(CUDA_NVCC_FLAGS_CONFIG ${_cuda_nvcc_flags_config}) + endif() + + # Determine output directory + cuda_compute_build_path("${file}" cuda_build_path) + set(cuda_compile_intermediate_directory "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${cuda_target}.dir/${cuda_build_path}") + if(CUDA_GENERATED_OUTPUT_DIR) + set(cuda_compile_output_dir "${CUDA_GENERATED_OUTPUT_DIR}") + else() + if ( cuda_compile_to_external_module ) + set(cuda_compile_output_dir "${CMAKE_CURRENT_BINARY_DIR}") + else() + set(cuda_compile_output_dir "${cuda_compile_intermediate_directory}") + endif() + endif() + + # Add a custom target to generate a c or ptx file. ###################### + + get_filename_component( basename ${file} NAME ) + if( cuda_compile_to_external_module ) + set(generated_file_path "${cuda_compile_output_dir}") + set(generated_file_basename "${cuda_target}_generated_${basename}.${cuda_compile_to_external_module_type}") + set(format_flag "-${cuda_compile_to_external_module_type}") + file(MAKE_DIRECTORY "${cuda_compile_output_dir}") + else() + set(generated_file_path "${cuda_compile_output_dir}/${CMAKE_CFG_INTDIR}") + set(generated_file_basename "${cuda_target}_generated_${basename}${generated_extension}") + if(CUDA_SEPARABLE_COMPILATION) + set(format_flag "-dc") + else() + set(format_flag "-c") + endif() + endif() + + # Set all of our file names. Make sure that whatever filenames that have + # generated_file_path in them get passed in through as a command line + # argument, so that the ${CMAKE_CFG_INTDIR} gets expanded at run time + # instead of configure time. + set(generated_file "${generated_file_path}/${generated_file_basename}") + set(cmake_dependency_file "${cuda_compile_intermediate_directory}/${generated_file_basename}.depend") + set(NVCC_generated_dependency_file "${cuda_compile_intermediate_directory}/${generated_file_basename}.NVCC-depend") + set(generated_cubin_file "${generated_file_path}/${generated_file_basename}.cubin.txt") + set(custom_target_script "${cuda_compile_intermediate_directory}/${generated_file_basename}.cmake") + + # Setup properties for obj files: + if( NOT cuda_compile_to_external_module ) + set_source_files_properties("${generated_file}" + PROPERTIES + EXTERNAL_OBJECT true # This is an object file not to be compiled, but only be linked. + ) + endif() + + # Don't add CMAKE_CURRENT_SOURCE_DIR if the path is already an absolute path. 
+ get_filename_component(file_path "${file}" PATH) + if(IS_ABSOLUTE "${file_path}") + set(source_file "${file}") + else() + set(source_file "${CMAKE_CURRENT_SOURCE_DIR}/${file}") + endif() + + if( NOT cuda_compile_to_external_module AND CUDA_SEPARABLE_COMPILATION) + list(APPEND ${cuda_target}_SEPARABLE_COMPILATION_OBJECTS "${generated_file}") + endif() + + # Bring in the dependencies. Creates a variable CUDA_NVCC_DEPEND ####### + cuda_include_nvcc_dependencies(${cmake_dependency_file}) + + # Convience string for output ########################################### + if(CUDA_BUILD_EMULATION) + set(cuda_build_type "Emulation") + else() + set(cuda_build_type "Device") + endif() + + # Build the NVCC made dependency file ################################### + set(build_cubin OFF) + if ( NOT CUDA_BUILD_EMULATION AND CUDA_BUILD_CUBIN ) + if ( NOT cuda_compile_to_external_module ) + set ( build_cubin ON ) + endif() + endif() + + # Configure the build script + configure_file("${CUDA_run_nvcc}" "${custom_target_script}" @ONLY) + + # So if a user specifies the same cuda file as input more than once, you + # can have bad things happen with dependencies. Here we check an option + # to see if this is the behavior they want. + if(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE) + set(main_dep MAIN_DEPENDENCY ${source_file}) + else() + set(main_dep DEPENDS ${source_file}) + endif() + + if(CUDA_VERBOSE_BUILD) + set(verbose_output ON) + elseif(CMAKE_GENERATOR MATCHES "Makefiles") + set(verbose_output "$(VERBOSE)") + else() + set(verbose_output OFF) + endif() + + # Create up the comment string + file(RELATIVE_PATH generated_file_relative_path "${CMAKE_BINARY_DIR}" "${generated_file}") + if(cuda_compile_to_external_module) + set(cuda_build_comment_string "Building NVCC ${cuda_compile_to_external_module_type} file ${generated_file_relative_path}") + else() + set(cuda_build_comment_string "Building NVCC (${cuda_build_type}) object ${generated_file_relative_path}") + endif() + + # Build the generated file and dependency file ########################## + add_custom_command( + OUTPUT ${generated_file} + # These output files depend on the source_file and the contents of cmake_dependency_file + ${main_dep} + DEPENDS ${CUDA_NVCC_DEPEND} + DEPENDS ${custom_target_script} + # Make sure the output directory exists before trying to write to it. + COMMAND ${CMAKE_COMMAND} -E make_directory "${generated_file_path}" + COMMAND ${CMAKE_COMMAND} ARGS + -D verbose:BOOL=${verbose_output} + ${ccbin_flags} + -D build_configuration:STRING=${CUDA_build_configuration} + -D "generated_file:STRING=${generated_file}" + -D "generated_cubin_file:STRING=${generated_cubin_file}" + -P "${custom_target_script}" + WORKING_DIRECTORY "${cuda_compile_intermediate_directory}" + COMMENT "${cuda_build_comment_string}" + ) + + # Make sure the build system knows the file is generated. 
+ set_source_files_properties(${generated_file} PROPERTIES GENERATED TRUE) + + list(APPEND _cuda_wrap_generated_files ${generated_file}) + + # Add the other files that we want cmake to clean on a cleanup ########## + list(APPEND CUDA_ADDITIONAL_CLEAN_FILES "${cmake_dependency_file}") + list(REMOVE_DUPLICATES CUDA_ADDITIONAL_CLEAN_FILES) + set(CUDA_ADDITIONAL_CLEAN_FILES ${CUDA_ADDITIONAL_CLEAN_FILES} CACHE INTERNAL "List of intermediate files that are part of the cuda dependency scanning.") + + endif() + endforeach() + + # Set the return parameter + set(${generated_files} ${_cuda_wrap_generated_files}) +endmacro() + +function(_cuda_get_important_host_flags important_flags flag_string) + if(CMAKE_GENERATOR MATCHES "Visual Studio") + string(REGEX MATCHALL "/M[DT][d]?" flags "${flag_string}") + list(APPEND ${important_flags} ${flags}) + else() + string(REGEX MATCHALL "-fPIC" flags "${flag_string}") + list(APPEND ${important_flags} ${flags}) + endif() + set(${important_flags} ${${important_flags}} PARENT_SCOPE) +endfunction() + +############################################################################### +############################################################################### +# Separable Compilation Link +############################################################################### +############################################################################### + +# Compute the filename to be used by CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS +function(CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME output_file_var cuda_target object_files) + if (object_files) + set(generated_extension ${CMAKE_${CUDA_C_OR_CXX}_OUTPUT_EXTENSION}) + set(output_file "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${cuda_target}.dir/${CMAKE_CFG_INTDIR}/${cuda_target}_intermediate_link${generated_extension}") + else() + set(output_file) + endif() + + set(${output_file_var} "${output_file}" PARENT_SCOPE) +endfunction() + +# Setup the build rule for the separable compilation intermediate link file. +function(CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS output_file cuda_target options object_files) + if (object_files) + + set_source_files_properties("${output_file}" + PROPERTIES + EXTERNAL_OBJECT TRUE # This is an object file not to be compiled, but only + # be linked. + GENERATED TRUE # This file is generated during the build + ) + + # For now we are ignoring all the configuration specific flags. + set(nvcc_flags) + CUDA_PARSE_NVCC_OPTIONS(nvcc_flags ${options}) + if(CUDA_64_BIT_DEVICE_CODE) + list(APPEND nvcc_flags -m64) + else() + list(APPEND nvcc_flags -m32) + endif() + # If -ccbin, --compiler-bindir has been specified, don't do anything. Otherwise add it here. 
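+    # For illustration (compiler path hypothetical): passing
+    #   OPTIONS -ccbin /usr/bin/g++-4.8
+    # to CUDA_ADD_LIBRARY or CUDA_ADD_EXECUTABLE ends up in these options and
+    # suppresses the automatic -ccbin "${CUDA_HOST_COMPILER}" added below.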
+ list( FIND nvcc_flags "-ccbin" ccbin_found0 ) + list( FIND nvcc_flags "--compiler-bindir" ccbin_found1 ) + if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 AND CUDA_HOST_COMPILER ) + list(APPEND nvcc_flags -ccbin "\"${CUDA_HOST_COMPILER}\"") + endif() + + # Create a list of flags specified by CUDA_NVCC_FLAGS_${CONFIG} and CMAKE_${CUDA_C_OR_CXX}_FLAGS* + set(config_specific_flags) + set(flags) + foreach(config ${CUDA_configuration_types}) + string(TOUPPER ${config} config_upper) + # Add config specific flags + foreach(f ${CUDA_NVCC_FLAGS_${config_upper}}) + list(APPEND config_specific_flags $<$:${f}>) + endforeach() + set(important_host_flags) + _cuda_get_important_host_flags(important_host_flags "${CMAKE_${CUDA_C_OR_CXX}_FLAGS_${config_upper}}") + foreach(f ${important_host_flags}) + list(APPEND flags $<$:-Xcompiler> $<$:${f}>) + endforeach() + endforeach() + # Add CMAKE_${CUDA_C_OR_CXX}_FLAGS + set(important_host_flags) + _cuda_get_important_host_flags(important_host_flags "${CMAKE_${CUDA_C_OR_CXX}_FLAGS}") + foreach(f ${important_host_flags}) + list(APPEND flags -Xcompiler ${f}) + endforeach() + + # Add our general CUDA_NVCC_FLAGS with the configuration specifig flags + set(nvcc_flags ${CUDA_NVCC_FLAGS} ${config_specific_flags} ${nvcc_flags}) + + file(RELATIVE_PATH output_file_relative_path "${CMAKE_BINARY_DIR}" "${output_file}") + + # Some generators don't handle the multiple levels of custom command + # dependencies correctly (obj1 depends on file1, obj2 depends on obj1), so + # we work around that issue by compiling the intermediate link object as a + # pre-link custom command in that situation. + set(do_obj_build_rule TRUE) + if (MSVC_VERSION GREATER 1599) + # VS 2010 and 2012 have this problem. If future versions fix this issue, + # it should still work, it just won't be as nice as the other method. + set(do_obj_build_rule FALSE) + endif() + + if (do_obj_build_rule) + add_custom_command( + OUTPUT ${output_file} + DEPENDS ${object_files} + COMMAND ${CUDA_NVCC_EXECUTABLE} ${nvcc_flags} -dlink ${object_files} -o ${output_file} + ${flags} + COMMENT "Building NVCC intermediate link file ${output_file_relative_path}" + ) + else() + get_filename_component(output_file_dir "${output_file}" DIRECTORY) + add_custom_command( + TARGET ${cuda_target} + PRE_LINK + COMMAND ${CMAKE_COMMAND} -E echo "Building NVCC intermediate link file ${output_file_relative_path}" + COMMAND ${CMAKE_COMMAND} -E make_directory "${output_file_dir}" + COMMAND ${CUDA_NVCC_EXECUTABLE} ${nvcc_flags} ${flags} -dlink ${object_files} -o "${output_file}" + ) + endif() + endif() +endfunction() + +############################################################################### +############################################################################### +# ADD LIBRARY +############################################################################### +############################################################################### +macro(CUDA_ADD_LIBRARY cuda_target) + + CUDA_ADD_CUDA_INCLUDE_ONCE() + + # Separate the sources from the options + CUDA_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _options ${ARGN}) + CUDA_BUILD_SHARED_LIBRARY(_cuda_shared_flag ${ARGN}) + # Create custom commands and targets for each file. + CUDA_WRAP_SRCS( ${cuda_target} OBJ _generated_files ${_sources} + ${_cmake_options} ${_cuda_shared_flag} + OPTIONS ${_options} ) + + # Compute the file name of the intermedate link file used for separable + # compilation. 
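+  # For illustration (target name hypothetical): for a target "mylib" built with
+  # a Makefile generator this computes a path along the lines of
+  #   ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/mylib.dir/mylib_intermediate_link.o
+  # and it stays empty when no separable-compilation objects were collected.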
+ CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME(link_file ${cuda_target} "${${cuda_target}_SEPARABLE_COMPILATION_OBJECTS}") + + # Add the library. + add_library(${cuda_target} ${_cmake_options} + ${_generated_files} + ${_sources} + ${link_file} + ) + + # Add a link phase for the separable compilation if it has been enabled. If + # it has been enabled then the ${cuda_target}_SEPARABLE_COMPILATION_OBJECTS + # variable will have been defined. + CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS("${link_file}" ${cuda_target} "${_options}" "${${cuda_target}_SEPARABLE_COMPILATION_OBJECTS}") + + target_link_libraries(${cuda_target} + ${CUDA_LIBRARIES} + ) + + # We need to set the linker language based on what the expected generated file + # would be. CUDA_C_OR_CXX is computed based on CUDA_HOST_COMPILATION_CPP. + set_target_properties(${cuda_target} + PROPERTIES + LINKER_LANGUAGE ${CUDA_C_OR_CXX} + ) + +endmacro() + + +############################################################################### +############################################################################### +# ADD EXECUTABLE +############################################################################### +############################################################################### +macro(CUDA_ADD_EXECUTABLE cuda_target) + + CUDA_ADD_CUDA_INCLUDE_ONCE() + + # Separate the sources from the options + CUDA_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _options ${ARGN}) + # Create custom commands and targets for each file. + CUDA_WRAP_SRCS( ${cuda_target} OBJ _generated_files ${_sources} OPTIONS ${_options} ) + + # Compute the file name of the intermedate link file used for separable + # compilation. + CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME(link_file ${cuda_target} "${${cuda_target}_SEPARABLE_COMPILATION_OBJECTS}") + + # Add the library. + add_executable(${cuda_target} ${_cmake_options} + ${_generated_files} + ${_sources} + ${link_file} + ) + + # Add a link phase for the separable compilation if it has been enabled. If + # it has been enabled then the ${cuda_target}_SEPARABLE_COMPILATION_OBJECTS + # variable will have been defined. + CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS("${link_file}" ${cuda_target} "${_options}" "${${cuda_target}_SEPARABLE_COMPILATION_OBJECTS}") + + target_link_libraries(${cuda_target} + ${CUDA_LIBRARIES} + ) + + # We need to set the linker language based on what the expected generated file + # would be. CUDA_C_OR_CXX is computed based on CUDA_HOST_COMPILATION_CPP. + set_target_properties(${cuda_target} + PROPERTIES + LINKER_LANGUAGE ${CUDA_C_OR_CXX} + ) + +endmacro() + + +############################################################################### +############################################################################### +# (Internal) helper for manually added cuda source files with specific targets +############################################################################### +############################################################################### +macro(cuda_compile_base cuda_target format generated_files) + + # Separate the sources from the options + CUDA_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _options ${ARGN}) + # Create custom commands and targets for each file. 
+ CUDA_WRAP_SRCS( ${cuda_target} ${format} _generated_files ${_sources} ${_cmake_options} + OPTIONS ${_options} ) + + set( ${generated_files} ${_generated_files}) + +endmacro() + +############################################################################### +############################################################################### +# CUDA COMPILE +############################################################################### +############################################################################### +macro(CUDA_COMPILE generated_files) + cuda_compile_base(cuda_compile OBJ ${generated_files} ${ARGN}) +endmacro() + +############################################################################### +############################################################################### +# CUDA COMPILE PTX +############################################################################### +############################################################################### +macro(CUDA_COMPILE_PTX generated_files) + cuda_compile_base(cuda_compile_ptx PTX ${generated_files} ${ARGN}) +endmacro() + +############################################################################### +############################################################################### +# CUDA COMPILE FATBIN +############################################################################### +############################################################################### +macro(CUDA_COMPILE_FATBIN generated_files) + cuda_compile_base(cuda_compile_fatbin FATBIN ${generated_files} ${ARGN}) +endmacro() + +############################################################################### +############################################################################### +# CUDA COMPILE CUBIN +############################################################################### +############################################################################### +macro(CUDA_COMPILE_CUBIN generated_files) + cuda_compile_base(cuda_compile_cubin CUBIN ${generated_files} ${ARGN}) +endmacro() + + +############################################################################### +############################################################################### +# CUDA ADD CUFFT TO TARGET +############################################################################### +############################################################################### +macro(CUDA_ADD_CUFFT_TO_TARGET target) + if (CUDA_BUILD_EMULATION) + target_link_libraries(${target} ${CUDA_cufftemu_LIBRARY}) + else() + target_link_libraries(${target} ${CUDA_cufft_LIBRARY}) + endif() +endmacro() + +############################################################################### +############################################################################### +# CUDA ADD CUBLAS TO TARGET +############################################################################### +############################################################################### +macro(CUDA_ADD_CUBLAS_TO_TARGET target) + if (CUDA_BUILD_EMULATION) + target_link_libraries(${target} ${CUDA_cublasemu_LIBRARY}) + else() + target_link_libraries(${target} ${CUDA_cublas_LIBRARY}) + endif() +endmacro() + +############################################################################### +############################################################################### +# CUDA BUILD CLEAN TARGET +############################################################################### 
+############################################################################### +macro(CUDA_BUILD_CLEAN_TARGET) + # Call this after you add all your CUDA targets, and you will get a convience + # target. You should also make clean after running this target to get the + # build system to generate all the code again. + + set(cuda_clean_target_name clean_cuda_depends) + if (CMAKE_GENERATOR MATCHES "Visual Studio") + string(TOUPPER ${cuda_clean_target_name} cuda_clean_target_name) + endif() + add_custom_target(${cuda_clean_target_name} + COMMAND ${CMAKE_COMMAND} -E remove ${CUDA_ADDITIONAL_CLEAN_FILES}) + + # Clear out the variable, so the next time we configure it will be empty. + # This is useful so that the files won't persist in the list after targets + # have been removed. + set(CUDA_ADDITIONAL_CLEAN_FILES "" CACHE INTERNAL "List of intermediate files that are part of the cuda dependency scanning.") +endmacro() diff --git a/cmake/FindCUDA/make2cmake.cmake b/cmake/FindCUDA/make2cmake.cmake new file mode 100644 index 0000000..c433fa8 --- /dev/null +++ b/cmake/FindCUDA/make2cmake.cmake @@ -0,0 +1,92 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# Copyright (c) 2007-2009 +# Scientific Computing and Imaging Institute, University of Utah +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. + +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. +# + +####################################################################### +# This converts a file written in makefile syntax into one that can be included +# by CMake. + +file(READ ${input_file} depend_text) + +if (NOT "${depend_text}" STREQUAL "") + + # message("FOUND DEPENDS") + + string(REPLACE "\\ " " " depend_text ${depend_text}) + + # This works for the nvcc -M generated dependency files. + string(REGEX REPLACE "^.* : " "" depend_text ${depend_text}) + string(REGEX REPLACE "[ \\\\]*\n" ";" depend_text ${depend_text}) + + set(dependency_list "") + + foreach(file ${depend_text}) + + string(REGEX REPLACE "^ +" "" file ${file}) + + # OK, now if we had a UNC path, nvcc has a tendency to only output the first '/' + # instead of '//'. 
Here we will test to see if the file exists, if it doesn't then + # try to prepend another '/' to the path and test again. If it still fails remove the + # path. + + if(NOT EXISTS "${file}") + if (EXISTS "/${file}") + set(file "/${file}") + else() + message(WARNING " Removing non-existent dependency file: ${file}") + set(file "") + endif() + endif() + + if(NOT IS_DIRECTORY "${file}") + # If softlinks start to matter, we should change this to REALPATH. For now we need + # to flatten paths, because nvcc can generate stuff like /bin/../include instead of + # just /include. + get_filename_component(file_absolute "${file}" ABSOLUTE) + list(APPEND dependency_list "${file_absolute}") + endif() + + endforeach() + +else() + # message("FOUND NO DEPENDS") +endif() + +# Remove the duplicate entries and sort them. +list(REMOVE_DUPLICATES dependency_list) +list(SORT dependency_list) + +foreach(file ${dependency_list}) + set(cuda_nvcc_depend "${cuda_nvcc_depend} \"${file}\"\n") +endforeach() + +file(WRITE ${output_file} "# Generated by: make2cmake.cmake\nSET(CUDA_NVCC_DEPEND\n ${cuda_nvcc_depend})\n\n") diff --git a/cmake/FindCUDA/parse_cubin.cmake b/cmake/FindCUDA/parse_cubin.cmake new file mode 100644 index 0000000..626c8a2 --- /dev/null +++ b/cmake/FindCUDA/parse_cubin.cmake @@ -0,0 +1,111 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# Copyright (c) 2007-2009 +# Scientific Computing and Imaging Institute, University of Utah +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. + +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. +# + +####################################################################### +# Parses a .cubin file produced by nvcc and reports statistics about the file. + + +file(READ ${input_file} file_text) + +if (NOT "${file_text}" STREQUAL "") + + string(REPLACE ";" "\\;" file_text ${file_text}) + string(REPLACE "\ncode" ";code" file_text ${file_text}) + + list(LENGTH file_text len) + + foreach(line ${file_text}) + + # Only look at "code { }" blocks. + if(line MATCHES "^code") + + # Break into individual lines. + string(REGEX REPLACE "\n" ";" line ${line}) + + foreach(entry ${line}) + + # Extract kernel names. 
+ if (${entry} MATCHES "[^g]name = ([^ ]+)") + set(entry "${CMAKE_MATCH_1}") + + # Check to see if the kernel name starts with "_" + set(skip FALSE) + # if (${entry} MATCHES "^_") + # Skip the rest of this block. + # message("Skipping ${entry}") + # set(skip TRUE) + # else () + message("Kernel: ${entry}") + # endif () + + endif() + + # Skip the rest of the block if necessary + if(NOT skip) + + # Registers + if (${entry} MATCHES "reg([ ]+)=([ ]+)([^ ]+)") + set(entry "${CMAKE_MATCH_3}") + message("Registers: ${entry}") + endif() + + # Local memory + if (${entry} MATCHES "lmem([ ]+)=([ ]+)([^ ]+)") + set(entry "${CMAKE_MATCH_3}") + message("Local: ${entry}") + endif() + + # Shared memory + if (${entry} MATCHES "smem([ ]+)=([ ]+)([^ ]+)") + set(entry "${CMAKE_MATCH_3}") + message("Shared: ${entry}") + endif() + + if (${entry} MATCHES "^}") + message("") + endif() + + endif() + + + endforeach() + + endif() + + endforeach() + +else() + # message("FOUND NO DEPENDS") +endif() + + diff --git a/cmake/FindCUDA/run_nvcc.cmake b/cmake/FindCUDA/run_nvcc.cmake new file mode 100644 index 0000000..abdd307 --- /dev/null +++ b/cmake/FindCUDA/run_nvcc.cmake @@ -0,0 +1,288 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. + +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. 
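# As a rough illustration (the real command line is generated by FindCUDA.cmake
# and differs in detail), this script is driven as a -P script with the
# variables documented above passed in via -D, along the lines of:
#
#   cmake -D verbose:BOOL=ON
#         -D build_configuration:STRING=Release
#         -D "generated_file:STRING=<path to foo.cu.o>"
#         -P run_nvcc.cmake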
+ +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "@CMAKE_COMMAND@") # path +set(source_file "@source_file@") # path +set(NVCC_generated_dependency_file "@NVCC_generated_dependency_file@") # path +set(cmake_dependency_file "@cmake_dependency_file@") # path +set(CUDA_make2cmake "@CUDA_make2cmake@") # path +set(CUDA_parse_cubin "@CUDA_parse_cubin@") # path +set(build_cubin @build_cubin@) # bool +set(CUDA_HOST_COMPILER "@CUDA_HOST_COMPILER@") # path +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. +set(generated_file_path "@generated_file_path@") # path +set(generated_file_internal "@generated_file@") # path +set(generated_cubin_file_internal "@generated_cubin_file@") # path + +set(CUDA_NVCC_EXECUTABLE "@CUDA_NVCC_EXECUTABLE@") # path +set(CUDA_NVCC_FLAGS @CUDA_NVCC_FLAGS@ ;; @CUDA_WRAP_OPTION_NVCC_FLAGS@) # list +@CUDA_NVCC_FLAGS_CONFIG@ +set(nvcc_flags @nvcc_flags@) # list +set(CUDA_NVCC_INCLUDE_ARGS "@CUDA_NVCC_INCLUDE_ARGS@") # list (needs to be in quotes to handle spaces properly). +set(format_flag "@format_flag@") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. +@CUDA_HOST_FLAGS@ + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 AND CUDA_HOST_COMPILER ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. +# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. 
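# A minimal sketch of the calling convention described above (the real calls
# appear further down in this file):
#
#   cuda_execute_process(
#     "Some status text"
#     COMMAND "${CMAKE_COMMAND}" -E echo "hello"
#     )
#   if(CUDA_result)
#     message(FATAL_ERROR "echo failed")
#   endif()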
+macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT "x${_command}" STREQUAL "xCOMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. +set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION @CUDA_VERSION@) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
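# Concretely:
#   -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"   <- works
#   -D input_file:FILEPATH="${NVCC_generated_dependency_file}"   <- the quotes
#      reach CMake and the file is not found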
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. + cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/cmake/FindPackageHandleStandardArgs.cmake b/cmake/FindPackageHandleStandardArgs.cmake new file mode 100644 index 0000000..2de1fb3 --- /dev/null +++ b/cmake/FindPackageHandleStandardArgs.cmake @@ -0,0 +1,382 @@ +#.rst: +# FindPackageHandleStandardArgs +# ----------------------------- +# +# +# +# FIND_PACKAGE_HANDLE_STANDARD_ARGS( ... ) +# +# This function is intended to be used in FindXXX.cmake modules files. +# It handles the REQUIRED, QUIET and version-related arguments to +# find_package(). It also sets the _FOUND variable. The +# package is considered found if all variables ... listed contain +# valid results, e.g. valid filepaths. +# +# There are two modes of this function. The first argument in both +# modes is the name of the Find-module where it is called (in original +# casing). +# +# The first simple mode looks like this: +# +# :: +# +# FIND_PACKAGE_HANDLE_STANDARD_ARGS( +# (DEFAULT_MSG|"Custom failure message") ... ) +# +# If the variables to are all valid, then +# _FOUND will be set to TRUE. If DEFAULT_MSG is given +# as second argument, then the function will generate itself useful +# success and error messages. You can also supply a custom error +# message for the failure case. 
This is not recommended. +# +# The second mode is more powerful and also supports version checking: +# +# :: +# +# FIND_PACKAGE_HANDLE_STANDARD_ARGS(NAME +# [FOUND_VAR ] +# [REQUIRED_VARS ...] +# [VERSION_VAR ] +# [HANDLE_COMPONENTS] +# [CONFIG_MODE] +# [FAIL_MESSAGE "Custom failure message"] ) +# +# In this mode, the name of the result-variable can be set either to +# either _FOUND or _FOUND using the +# FOUND_VAR option. Other names for the result-variable are not +# allowed. So for a Find-module named FindFooBar.cmake, the two +# possible names are FooBar_FOUND and FOOBAR_FOUND. It is recommended +# to use the original case version. If the FOUND_VAR option is not +# used, the default is _FOUND. +# +# As in the simple mode, if through are all valid, +# _FOUND will be set to TRUE. After REQUIRED_VARS the +# variables which are required for this package are listed. Following +# VERSION_VAR the name of the variable can be specified which holds the +# version of the package which has been found. If this is done, this +# version will be checked against the (potentially) specified required +# version used in the find_package() call. The EXACT keyword is also +# handled. The default messages include information about the required +# version and the version which has been actually found, both if the +# version is ok or not. If the package supports components, use the +# HANDLE_COMPONENTS option to enable handling them. In this case, +# find_package_handle_standard_args() will report which components have +# been found and which are missing, and the _FOUND variable +# will be set to FALSE if any of the required components (i.e. not the +# ones listed after OPTIONAL_COMPONENTS) are missing. Use the option +# CONFIG_MODE if your FindXXX.cmake module is a wrapper for a +# find_package(... NO_MODULE) call. In this case VERSION_VAR will be +# set to _VERSION and the macro will automatically check whether +# the Config module was found. Via FAIL_MESSAGE a custom failure +# message can be specified, if this is not used, the default message +# will be displayed. +# +# Example for mode 1: +# +# :: +# +# find_package_handle_standard_args(LibXml2 DEFAULT_MSG +# LIBXML2_LIBRARY LIBXML2_INCLUDE_DIR) +# +# +# +# LibXml2 is considered to be found, if both LIBXML2_LIBRARY and +# LIBXML2_INCLUDE_DIR are valid. Then also LIBXML2_FOUND is set to +# TRUE. If it is not found and REQUIRED was used, it fails with +# FATAL_ERROR, independent whether QUIET was used or not. If it is +# found, success will be reported, including the content of . On +# repeated Cmake runs, the same message won't be printed again. +# +# Example for mode 2: +# +# :: +# +# find_package_handle_standard_args(LibXslt +# FOUND_VAR LibXslt_FOUND +# REQUIRED_VARS LibXslt_LIBRARIES LibXslt_INCLUDE_DIRS +# VERSION_VAR LibXslt_VERSION_STRING) +# +# In this case, LibXslt is considered to be found if the variable(s) +# listed after REQUIRED_VAR are all valid, i.e. LibXslt_LIBRARIES and +# LibXslt_INCLUDE_DIRS in this case. The result will then be stored in +# LibXslt_FOUND . Also the version of LibXslt will be checked by using +# the version contained in LibXslt_VERSION_STRING. Since no +# FAIL_MESSAGE is given, the default messages will be printed. +# +# Another example for mode 2: +# +# :: +# +# find_package(Automoc4 QUIET NO_MODULE HINTS /opt/automoc4) +# find_package_handle_standard_args(Automoc4 CONFIG_MODE) +# +# In this case, FindAutmoc4.cmake wraps a call to find_package(Automoc4 +# NO_MODULE) and adds an additional search directory for automoc4. 
Here +# the result will be stored in AUTOMOC4_FOUND. The following +# FIND_PACKAGE_HANDLE_STANDARD_ARGS() call produces a proper +# success/error message. + +#============================================================================= +# Copyright 2007-2009 Kitware, Inc. +# +# Distributed under the OSI-approved BSD License (the "License"); +# see accompanying file Copyright.txt for details. +# +# This software is distributed WITHOUT ANY WARRANTY; without even the +# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the License for more information. +#============================================================================= +# (To distribute this file outside of CMake, substitute the full +# License text for the above reference.) + +include(${CMAKE_CURRENT_LIST_DIR}/FindPackageMessage.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/CMakeParseArguments.cmake) + +# internal helper macro +macro(_FPHSA_FAILURE_MESSAGE _msg) + if (${_NAME}_FIND_REQUIRED) + message(FATAL_ERROR "${_msg}") + else () + if (NOT ${_NAME}_FIND_QUIETLY) + message(STATUS "${_msg}") + endif () + endif () +endmacro() + + +# internal helper macro to generate the failure message when used in CONFIG_MODE: +macro(_FPHSA_HANDLE_FAILURE_CONFIG_MODE) + # _CONFIG is set, but FOUND is false, this means that some other of the REQUIRED_VARS was not found: + if(${_NAME}_CONFIG) + _FPHSA_FAILURE_MESSAGE("${FPHSA_FAIL_MESSAGE}: missing: ${MISSING_VARS} (found ${${_NAME}_CONFIG} ${VERSION_MSG})") + else() + # If _CONSIDERED_CONFIGS is set, the config-file has been found, but no suitable version. + # List them all in the error message: + if(${_NAME}_CONSIDERED_CONFIGS) + set(configsText "") + list(LENGTH ${_NAME}_CONSIDERED_CONFIGS configsCount) + math(EXPR configsCount "${configsCount} - 1") + foreach(currentConfigIndex RANGE ${configsCount}) + list(GET ${_NAME}_CONSIDERED_CONFIGS ${currentConfigIndex} filename) + list(GET ${_NAME}_CONSIDERED_VERSIONS ${currentConfigIndex} version) + set(configsText "${configsText} ${filename} (version ${version})\n") + endforeach() + if (${_NAME}_NOT_FOUND_MESSAGE) + set(configsText "${configsText} Reason given by package: ${${_NAME}_NOT_FOUND_MESSAGE}\n") + endif() + _FPHSA_FAILURE_MESSAGE("${FPHSA_FAIL_MESSAGE} ${VERSION_MSG}, checked the following files:\n${configsText}") + + else() + # Simple case: No Config-file was found at all: + _FPHSA_FAILURE_MESSAGE("${FPHSA_FAIL_MESSAGE}: found neither ${_NAME}Config.cmake nor ${_NAME_LOWER}-config.cmake ${VERSION_MSG}") + endif() + endif() +endmacro() + + +function(FIND_PACKAGE_HANDLE_STANDARD_ARGS _NAME _FIRST_ARG) + +# set up the arguments for CMAKE_PARSE_ARGUMENTS and check whether we are in +# new extended or in the "old" mode: + set(options CONFIG_MODE HANDLE_COMPONENTS) + set(oneValueArgs FAIL_MESSAGE VERSION_VAR FOUND_VAR) + set(multiValueArgs REQUIRED_VARS) + set(_KEYWORDS_FOR_EXTENDED_MODE ${options} ${oneValueArgs} ${multiValueArgs} ) + list(FIND _KEYWORDS_FOR_EXTENDED_MODE "${_FIRST_ARG}" INDEX) + + if(${INDEX} EQUAL -1) + set(FPHSA_FAIL_MESSAGE ${_FIRST_ARG}) + set(FPHSA_REQUIRED_VARS ${ARGN}) + set(FPHSA_VERSION_VAR) + else() + + CMAKE_PARSE_ARGUMENTS(FPHSA "${options}" "${oneValueArgs}" "${multiValueArgs}" ${_FIRST_ARG} ${ARGN}) + + if(FPHSA_UNPARSED_ARGUMENTS) + message(FATAL_ERROR "Unknown keywords given to FIND_PACKAGE_HANDLE_STANDARD_ARGS(): \"${FPHSA_UNPARSED_ARGUMENTS}\"") + endif() + + if(NOT FPHSA_FAIL_MESSAGE) + set(FPHSA_FAIL_MESSAGE "DEFAULT_MSG") + endif() + endif() + +# now that we collected all 
arguments, process them + + if("x${FPHSA_FAIL_MESSAGE}" STREQUAL "xDEFAULT_MSG") + set(FPHSA_FAIL_MESSAGE "Could NOT find ${_NAME}") + endif() + + # In config-mode, we rely on the variable _CONFIG, which is set by find_package() + # when it successfully found the config-file, including version checking: + if(FPHSA_CONFIG_MODE) + list(INSERT FPHSA_REQUIRED_VARS 0 ${_NAME}_CONFIG) + list(REMOVE_DUPLICATES FPHSA_REQUIRED_VARS) + set(FPHSA_VERSION_VAR ${_NAME}_VERSION) + endif() + + if(NOT FPHSA_REQUIRED_VARS) + message(FATAL_ERROR "No REQUIRED_VARS specified for FIND_PACKAGE_HANDLE_STANDARD_ARGS()") + endif() + + list(GET FPHSA_REQUIRED_VARS 0 _FIRST_REQUIRED_VAR) + + string(TOUPPER ${_NAME} _NAME_UPPER) + string(TOLOWER ${_NAME} _NAME_LOWER) + + if(FPHSA_FOUND_VAR) + if(FPHSA_FOUND_VAR MATCHES "^${_NAME}_FOUND$" OR FPHSA_FOUND_VAR MATCHES "^${_NAME_UPPER}_FOUND$") + set(_FOUND_VAR ${FPHSA_FOUND_VAR}) + else() + message(FATAL_ERROR "The argument for FOUND_VAR is \"${FPHSA_FOUND_VAR}\", but only \"${_NAME}_FOUND\" and \"${_NAME_UPPER}_FOUND\" are valid names.") + endif() + else() + set(_FOUND_VAR ${_NAME_UPPER}_FOUND) + endif() + + # collect all variables which were not found, so they can be printed, so the + # user knows better what went wrong (#6375) + set(MISSING_VARS "") + set(DETAILS "") + # check if all passed variables are valid + unset(${_FOUND_VAR}) + foreach(_CURRENT_VAR ${FPHSA_REQUIRED_VARS}) + if(NOT ${_CURRENT_VAR}) + set(${_FOUND_VAR} FALSE) + set(MISSING_VARS "${MISSING_VARS} ${_CURRENT_VAR}") + else() + set(DETAILS "${DETAILS}[${${_CURRENT_VAR}}]") + endif() + endforeach() + if(NOT "${${_FOUND_VAR}}" STREQUAL "FALSE") + set(${_FOUND_VAR} TRUE) + endif() + + # component handling + unset(FOUND_COMPONENTS_MSG) + unset(MISSING_COMPONENTS_MSG) + + if(FPHSA_HANDLE_COMPONENTS) + foreach(comp ${${_NAME}_FIND_COMPONENTS}) + if(${_NAME}_${comp}_FOUND) + + if(NOT DEFINED FOUND_COMPONENTS_MSG) + set(FOUND_COMPONENTS_MSG "found components: ") + endif() + set(FOUND_COMPONENTS_MSG "${FOUND_COMPONENTS_MSG} ${comp}") + + else() + + if(NOT DEFINED MISSING_COMPONENTS_MSG) + set(MISSING_COMPONENTS_MSG "missing components: ") + endif() + set(MISSING_COMPONENTS_MSG "${MISSING_COMPONENTS_MSG} ${comp}") + + if(${_NAME}_FIND_REQUIRED_${comp}) + set(${_FOUND_VAR} FALSE) + set(MISSING_VARS "${MISSING_VARS} ${comp}") + endif() + + endif() + endforeach() + set(COMPONENT_MSG "${FOUND_COMPONENTS_MSG} ${MISSING_COMPONENTS_MSG}") + set(DETAILS "${DETAILS}[c${COMPONENT_MSG}]") + endif() + + # version handling: + set(VERSION_MSG "") + set(VERSION_OK TRUE) + set(VERSION ${${FPHSA_VERSION_VAR}}) + + # check with DEFINED here as the requested or found version may be "0" + if (DEFINED ${_NAME}_FIND_VERSION) + if(DEFINED ${FPHSA_VERSION_VAR}) + + if(${_NAME}_FIND_VERSION_EXACT) # exact version required + # count the dots in the version string + string(REGEX REPLACE "[^.]" "" _VERSION_DOTS "${VERSION}") + # add one dot because there is one dot more than there are components + string(LENGTH "${_VERSION_DOTS}." _VERSION_DOTS) + if (_VERSION_DOTS GREATER ${_NAME}_FIND_VERSION_COUNT) + # Because of the C++ implementation of find_package() ${_NAME}_FIND_VERSION_COUNT + # is at most 4 here. Therefore a simple lookup table is used. 
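  # Worked example: find_package(Foo 1.2 EXACT) with a found version of "1.2.3"
  # gives Foo_FIND_VERSION_COUNT = 2, so the branch below keeps "1.2" as
  # _VERSION_HEAD, which then compares VERSION_EQUAL to the requested "1.2".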
+ if (${_NAME}_FIND_VERSION_COUNT EQUAL 1) + set(_VERSION_REGEX "[^.]*") + elseif (${_NAME}_FIND_VERSION_COUNT EQUAL 2) + set(_VERSION_REGEX "[^.]*\\.[^.]*") + elseif (${_NAME}_FIND_VERSION_COUNT EQUAL 3) + set(_VERSION_REGEX "[^.]*\\.[^.]*\\.[^.]*") + else () + set(_VERSION_REGEX "[^.]*\\.[^.]*\\.[^.]*\\.[^.]*") + endif () + string(REGEX REPLACE "^(${_VERSION_REGEX})\\..*" "\\1" _VERSION_HEAD "${VERSION}") + unset(_VERSION_REGEX) + if (NOT ${_NAME}_FIND_VERSION VERSION_EQUAL _VERSION_HEAD) + set(VERSION_MSG "Found unsuitable version \"${VERSION}\", but required is exact version \"${${_NAME}_FIND_VERSION}\"") + set(VERSION_OK FALSE) + else () + set(VERSION_MSG "(found suitable exact version \"${VERSION}\")") + endif () + unset(_VERSION_HEAD) + else () + if (NOT ${_NAME}_FIND_VERSION VERSION_EQUAL VERSION) + set(VERSION_MSG "Found unsuitable version \"${VERSION}\", but required is exact version \"${${_NAME}_FIND_VERSION}\"") + set(VERSION_OK FALSE) + else () + set(VERSION_MSG "(found suitable exact version \"${VERSION}\")") + endif () + endif () + unset(_VERSION_DOTS) + + else() # minimum version specified: + if (${_NAME}_FIND_VERSION VERSION_GREATER VERSION) + set(VERSION_MSG "Found unsuitable version \"${VERSION}\", but required is at least \"${${_NAME}_FIND_VERSION}\"") + set(VERSION_OK FALSE) + else () + set(VERSION_MSG "(found suitable version \"${VERSION}\", minimum required is \"${${_NAME}_FIND_VERSION}\")") + endif () + endif() + + else() + + # if the package was not found, but a version was given, add that to the output: + if(${_NAME}_FIND_VERSION_EXACT) + set(VERSION_MSG "(Required is exact version \"${${_NAME}_FIND_VERSION}\")") + else() + set(VERSION_MSG "(Required is at least version \"${${_NAME}_FIND_VERSION}\")") + endif() + + endif() + else () + if(VERSION) + set(VERSION_MSG "(found version \"${VERSION}\")") + endif() + endif () + + if(VERSION_OK) + set(DETAILS "${DETAILS}[v${VERSION}(${${_NAME}_FIND_VERSION})]") + else() + set(${_FOUND_VAR} FALSE) + endif() + + + # print the result: + if (${_FOUND_VAR}) + FIND_PACKAGE_MESSAGE(${_NAME} "Found ${_NAME}: ${${_FIRST_REQUIRED_VAR}} ${VERSION_MSG} ${COMPONENT_MSG}" "${DETAILS}") + else () + + if(FPHSA_CONFIG_MODE) + _FPHSA_HANDLE_FAILURE_CONFIG_MODE() + else() + if(NOT VERSION_OK) + _FPHSA_FAILURE_MESSAGE("${FPHSA_FAIL_MESSAGE}: ${VERSION_MSG} (found ${${_FIRST_REQUIRED_VAR}})") + else() + _FPHSA_FAILURE_MESSAGE("${FPHSA_FAIL_MESSAGE} (missing: ${MISSING_VARS}) ${VERSION_MSG}") + endif() + endif() + + endif () + + set(${_FOUND_VAR} ${${_FOUND_VAR}} PARENT_SCOPE) + +endfunction() diff --git a/cmake/FindPackageMessage.cmake b/cmake/FindPackageMessage.cmake new file mode 100644 index 0000000..a0349d3 --- /dev/null +++ b/cmake/FindPackageMessage.cmake @@ -0,0 +1,57 @@ +#.rst: +# FindPackageMessage +# ------------------ +# +# +# +# FIND_PACKAGE_MESSAGE( "message for user" "find result details") +# +# This macro is intended to be used in FindXXX.cmake modules files. It +# will print a message once for each unique find result. This is useful +# for telling the user where a package was found. The first argument +# specifies the name (XXX) of the package. The second argument +# specifies the message to display. The third argument lists details +# about the find result so that if they change the message will be +# displayed again. The macro also obeys the QUIET argument to the +# find_package command. 
+# +# Example: +# +# :: +# +# if(X11_FOUND) +# FIND_PACKAGE_MESSAGE(X11 "Found X11: ${X11_X11_LIB}" +# "[${X11_X11_LIB}][${X11_INCLUDE_DIR}]") +# else() +# ... +# endif() + +#============================================================================= +# Copyright 2008-2009 Kitware, Inc. +# +# Distributed under the OSI-approved BSD License (the "License"); +# see accompanying file Copyright.txt for details. +# +# This software is distributed WITHOUT ANY WARRANTY; without even the +# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the License for more information. +#============================================================================= +# (To distribute this file outside of CMake, substitute the full +# License text for the above reference.) + +function(FIND_PACKAGE_MESSAGE pkg msg details) + # Avoid printing a message repeatedly for the same find result. + if(NOT ${pkg}_FIND_QUIETLY) + string(REPLACE "\n" "" details "${details}") + set(DETAILS_VAR FIND_PACKAGE_MESSAGE_DETAILS_${pkg}) + if(NOT "${details}" STREQUAL "${${DETAILS_VAR}}") + # The message has not yet been printed. + message(STATUS "${msg}") + + # Save the find details in the cache to avoid printing the same + # message again. + set("${DETAILS_VAR}" "${details}" + CACHE INTERNAL "Details about finding ${pkg}") + endif() + endif() +endfunction() diff --git a/favicon.ico b/favicon.ico deleted file mode 100644 index b4079bd..0000000 Binary files a/favicon.ico and /dev/null differ diff --git a/fftw_example/a.out b/fftw_example/a.out new file mode 100755 index 0000000..e9091a6 Binary files /dev/null and b/fftw_example/a.out differ diff --git a/fftw_example/undees_fftw_exmple.c b/fftw_example/undees_fftw_exmple.c new file mode 100644 index 0000000..fee3060 --- /dev/null +++ b/fftw_example/undees_fftw_exmple.c @@ -0,0 +1,108 @@ +/* Start reading here */ + +#include +#include +#include +#include + + +/* Never mind this bit */ + +#include +#include + +#define REAL 0 +#define IMAG 1 + +int NUM_POINTS = 0x1; + +void acquire_from_somewhere(fftw_complex* signal) { + /* Generate two sine waves of different frequencies and + * amplitudes. + */ + + int i; + for (i = 0; i < NUM_POINTS; ++i) { + double theta = (double)i / (double)NUM_POINTS * M_PI; + + signal[i][REAL] = 1.0 * cos(10.0 * theta) + + 0.5 * cos(25.0 * theta); + + signal[i][IMAG] = 1.0 * sin(10.0 * theta) + + 0.5 * sin(25.0 * theta); + } + printf("SAMPLES\n"); + for (i = 0; i < NUM_POINTS; ++i) { + double mag = sqrt(signal[i][REAL] * signal[i][REAL] + + signal[i][IMAG] * signal[i][IMAG]); + + printf("%g\n", mag); + } +} + +void do_something_with(fftw_complex* result) { + printf("RESULTS\n"); + int i; + for (i = 0; i < NUM_POINTS; ++i) { + double mag = sqrt(result[i][REAL] * result[i][REAL] + + result[i][IMAG] * result[i][IMAG]); + + printf("%g\n", mag); + } +} + + +/* Resume reading here */ + +int main() { + + LARGE_INTEGER StartingTime, EndingTime, ElapsedMicroseconds; + LARGE_INTEGER Frequency; + +// +// We now have the elapsed number of ticks, along with the +// number of ticks-per-second. We use these values +// to convert to the number of elapsed microseconds. +// To guard against loss-of-precision, we convert +// to microseconds *before* dividing by ticks-per-second. 
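// (Illustrative arithmetic: 30000 ticks at 3000000 ticks-per-second gives
// 30000 * 1000000 / 3000000 = 10000 microseconds, whereas dividing first
// would truncate to zero in integer math.)
//
// The #include directives at the top of this file are empty in this listing;
// judging from the calls used, the example needs at least <stdio.h>, <math.h>,
// <fftw3.h>, and <windows.h> for QueryPerformanceCounter/LARGE_INTEGER.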
+// + +ElapsedMicroseconds.QuadPart *= 1000000; +ElapsedMicroseconds.QuadPart /= Frequency.QuadPart; + + while (NUM_POINTS < (0x1 << 12)) + { + + char a = getc(stdin); + + NUM_POINTS = NUM_POINTS << 1; + fftw_complex signal[NUM_POINTS]; + fftw_complex result[NUM_POINTS]; + + fftw_plan plan = fftw_plan_dft_1d(NUM_POINTS, + signal, + result, + FFTW_FORWARD, + FFTW_ESTIMATE); + + acquire_from_somewhere(signal); + + QueryPerformanceFrequency(&Frequency); + QueryPerformanceCounter(&StartingTime); + + fftw_execute(plan); + + QueryPerformanceCounter(&EndingTime); + ElapsedMicroseconds.QuadPart = EndingTime.QuadPart - StartingTime.QuadPart; + ElapsedMicroseconds.QuadPart *= 1000000; + ElapsedMicroseconds.QuadPart /= Frequency.QuadPart; + + do_something_with(result); + + printf("Elapsed microseconds: %lu \n", ElapsedMicroseconds.QuadPart); + + fftw_destroy_plan(plan); + } + + return 0; +} diff --git a/glsl/clear.frag.glsl b/glsl/clear.frag.glsl deleted file mode 100644 index b4e4ff3..0000000 --- a/glsl/clear.frag.glsl +++ /dev/null @@ -1,12 +0,0 @@ -#version 100 -#extension GL_EXT_draw_buffers: enable -precision highp float; -precision highp int; - -#define NUM_GBUFFERS 4 - -void main() { - for (int i = 0; i < NUM_GBUFFERS; i++) { - gl_FragData[i] = vec4(0.0); - } -} diff --git a/glsl/copy.frag.glsl b/glsl/copy.frag.glsl deleted file mode 100644 index 823ebcd..0000000 --- a/glsl/copy.frag.glsl +++ /dev/null @@ -1,20 +0,0 @@ -#version 100 -#extension GL_EXT_draw_buffers: enable -precision highp float; -precision highp int; - -uniform sampler2D u_colmap; -uniform sampler2D u_normap; - -varying vec3 v_position; -varying vec3 v_normal; -varying vec2 v_uv; - -void main() { - // TODO: copy values into gl_FragData[0], [1], etc. - // You can use the GLSL texture2D function to access the textures using - // the UV in v_uv. 
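// One possible set of writes for the TODO above (illustrative; the g-buffer
// layout is left to the implementer, but this matches the layout that
// debug.frag.glsl assumes):
//   gl_FragData[0] = vec4(v_position, 1.0);
//   gl_FragData[1] = vec4(v_normal, 0.0);
//   gl_FragData[2] = texture2D(u_colmap, v_uv);
//   gl_FragData[3] = texture2D(u_normap, v_uv);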
- - // this gives you the idea - // gl_FragData[0] = vec4( v_position, 1.0 ); -} diff --git a/glsl/copy.vert.glsl b/glsl/copy.vert.glsl deleted file mode 100644 index ec14e69..0000000 --- a/glsl/copy.vert.glsl +++ /dev/null @@ -1,21 +0,0 @@ -#version 100 -#extension GL_EXT_draw_buffers: enable -precision highp float; -precision highp int; - -uniform mat4 u_cameraMat; - -attribute vec3 a_position; -attribute vec3 a_normal; -attribute vec2 a_uv; - -varying vec3 v_position; -varying vec3 v_normal; -varying vec2 v_uv; - -void main() { - gl_Position = u_cameraMat * vec4(a_position, 1.0); - v_position = a_position; - v_normal = a_normal; - v_uv = a_uv; -} diff --git a/glsl/deferred/ambient.frag.glsl b/glsl/deferred/ambient.frag.glsl deleted file mode 100644 index 1fd4647..0000000 --- a/glsl/deferred/ambient.frag.glsl +++ /dev/null @@ -1,27 +0,0 @@ - -#version 100 -precision highp float; -precision highp int; - -#define NUM_GBUFFERS 4 - -uniform sampler2D u_gbufs[NUM_GBUFFERS]; -uniform sampler2D u_depth; - -varying vec2 v_uv; - -void main() { - vec4 gb0 = texture2D(u_gbufs[0], v_uv); - vec4 gb1 = texture2D(u_gbufs[1], v_uv); - vec4 gb2 = texture2D(u_gbufs[2], v_uv); - vec4 gb3 = texture2D(u_gbufs[3], v_uv); - float depth = texture2D(u_depth, v_uv).x; - // TODO: Extract needed properties from the g-buffers into local variables - - if (depth == 1.0) { - gl_FragColor = vec4(0, 0, 0, 0); // set alpha to 0 - return; - } - - gl_FragColor = vec4(0.1, 0.1, 0.1, 1); // TODO: replace this -} diff --git a/glsl/deferred/blinnphong-pointlight.frag.glsl b/glsl/deferred/blinnphong-pointlight.frag.glsl deleted file mode 100644 index b24a54a..0000000 --- a/glsl/deferred/blinnphong-pointlight.frag.glsl +++ /dev/null @@ -1,39 +0,0 @@ -#version 100 -precision highp float; -precision highp int; - -#define NUM_GBUFFERS 4 - -uniform vec3 u_lightCol; -uniform vec3 u_lightPos; -uniform float u_lightRad; -uniform sampler2D u_gbufs[NUM_GBUFFERS]; -uniform sampler2D u_depth; - -varying vec2 v_uv; - -vec3 applyNormalMap(vec3 geomnor, vec3 normap) { - normap = normap * 2.0 - 1.0; - vec3 up = normalize(vec3(0.001, 1, 0.001)); - vec3 surftan = normalize(cross(geomnor, up)); - vec3 surfbinor = cross(geomnor, surftan); - return normap.y * surftan + normap.x * surfbinor + normap.z * geomnor; -} - -void main() { - vec4 gb0 = texture2D(u_gbufs[0], v_uv); - vec4 gb1 = texture2D(u_gbufs[1], v_uv); - vec4 gb2 = texture2D(u_gbufs[2], v_uv); - vec4 gb3 = texture2D(u_gbufs[3], v_uv); - float depth = texture2D(u_depth, v_uv).x; - // TODO: Extract needed properties from the g-buffers into local variables - - // If nothing was rendered to this pixel, set alpha to 0 so that the - // postprocessing step can render the sky color. 
- if (depth == 1.0) { - gl_FragColor = vec4(0, 0, 0, 0); - return; - } - - gl_FragColor = vec4(0, 0, 1, 1); // TODO: perform lighting calculations -} diff --git a/glsl/deferred/debug.frag.glsl b/glsl/deferred/debug.frag.glsl deleted file mode 100644 index 007466f..0000000 --- a/glsl/deferred/debug.frag.glsl +++ /dev/null @@ -1,53 +0,0 @@ -#version 100 -precision highp float; -precision highp int; - -#define NUM_GBUFFERS 4 - -uniform int u_debug; -uniform sampler2D u_gbufs[NUM_GBUFFERS]; -uniform sampler2D u_depth; - -varying vec2 v_uv; - -const vec4 SKY_COLOR = vec4(0.66, 0.73, 1.0, 1.0); - -vec3 applyNormalMap(vec3 geomnor, vec3 normap) { - normap = normap * 2.0 - 1.0; - vec3 up = normalize(vec3(0.001, 1, 0.001)); - vec3 surftan = normalize(cross(geomnor, up)); - vec3 surfbinor = cross(geomnor, surftan); - return normap.y * surftan + normap.x * surfbinor + normap.z * geomnor; -} - -void main() { - vec4 gb0 = texture2D(u_gbufs[0], v_uv); - vec4 gb1 = texture2D(u_gbufs[1], v_uv); - vec4 gb2 = texture2D(u_gbufs[2], v_uv); - vec4 gb3 = texture2D(u_gbufs[3], v_uv); - float depth = texture2D(u_depth, v_uv).x; - // TODO: Extract needed properties from the g-buffers into local variables - // These definitions are suggested for starting out, but you will probably want to change them. - vec3 pos = gb0.xyz; // World-space position - vec3 geomnor = gb1.xyz; // Normals of the geometry as defined, without normal mapping - vec3 colmap = gb2.rgb; // The color map - unlit "albedo" (surface color) - vec3 normap = gb3.xyz; // The raw normal map (normals relative to the surface they're on) - vec3 nor = applyNormalMap (geomnor, normap); // The true normals as we want to light them - with the normal map applied to the geometry normals (applyNormalMap above) - - // TODO: uncomment - if (u_debug == 0) { - gl_FragColor = vec4(vec3(depth), 1.0); - } else if (u_debug == 1) { - // gl_FragColor = vec4(abs(pos) * 0.1, 1.0); - } else if (u_debug == 2) { - // gl_FragColor = vec4(abs(geomnor), 1.0); - } else if (u_debug == 3) { - // gl_FragColor = vec4(colmap, 1.0); - } else if (u_debug == 4) { - // gl_FragColor = vec4(normap, 1.0); - } else if (u_debug == 5) { - // gl_FragColor = vec4(abs(nor), 1.0); - } else { - gl_FragColor = vec4(1, 0, 1, 1); - } -} diff --git a/glsl/post/one.frag.glsl b/glsl/post/one.frag.glsl deleted file mode 100644 index 94191cd..0000000 --- a/glsl/post/one.frag.glsl +++ /dev/null @@ -1,20 +0,0 @@ -#version 100 -precision highp float; -precision highp int; - -uniform sampler2D u_color; - -varying vec2 v_uv; - -const vec4 SKY_COLOR = vec4(0.01, 0.14, 0.42, 1.0); - -void main() { - vec4 color = texture2D(u_color, v_uv); - - if (color.a == 0.0) { - gl_FragColor = SKY_COLOR; - return; - } - - gl_FragColor = color; -} diff --git a/glsl/quad.vert.glsl b/glsl/quad.vert.glsl deleted file mode 100644 index a6ac319..0000000 --- a/glsl/quad.vert.glsl +++ /dev/null @@ -1,12 +0,0 @@ -#version 100 -precision highp float; -precision highp int; - -attribute vec3 a_position; - -varying vec2 v_uv; - -void main() { - gl_Position = vec4(a_position, 1.0); - v_uv = a_position.xy * 0.5 + 0.5; -} diff --git a/glsl/red.frag.glsl b/glsl/red.frag.glsl deleted file mode 100644 index f8ef1ec..0000000 --- a/glsl/red.frag.glsl +++ /dev/null @@ -1,7 +0,0 @@ -#version 100 -precision highp float; -precision highp int; - -void main() { - gl_FragColor = vec4(1, 0, 0, 1); -} diff --git a/img/Fourier_unit_pulse.png b/img/Fourier_unit_pulse.png new file mode 100644 index 0000000..4f1d573 Binary files /dev/null and 
b/img/Fourier_unit_pulse.png differ diff --git a/img/blocksizes.png b/img/blocksizes.png new file mode 100644 index 0000000..fe3469d Binary files /dev/null and b/img/blocksizes.png differ diff --git a/img/butterfly.png b/img/butterfly.png new file mode 100644 index 0000000..9ead313 Binary files /dev/null and b/img/butterfly.png differ diff --git a/img/correctbutterfly.png b/img/correctbutterfly.png new file mode 100644 index 0000000..8a1c90f Binary files /dev/null and b/img/correctbutterfly.png differ diff --git a/img/dft.png b/img/dft.png new file mode 100644 index 0000000..b433c35 Binary files /dev/null and b/img/dft.png differ diff --git a/img/gpugems.png b/img/gpugems.png new file mode 100644 index 0000000..7aadede Binary files /dev/null and b/img/gpugems.png differ diff --git a/img/implementations.png b/img/implementations.png new file mode 100644 index 0000000..6076ac8 Binary files /dev/null and b/img/implementations.png differ diff --git a/img/properties.png b/img/properties.png new file mode 100644 index 0000000..21db91d Binary files /dev/null and b/img/properties.png differ diff --git a/index.html b/index.html deleted file mode 100644 index 7ada197..0000000 --- a/index.html +++ /dev/null @@ -1,111 +0,0 @@ - - - - CIS 565 WebGL Deferred Shading - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-      DEBUG MODE! (Disable before measuring performance.)
- - diff --git a/js/deferredRender.js b/js/deferredRender.js deleted file mode 100644 index bb3edd4..0000000 --- a/js/deferredRender.js +++ /dev/null @@ -1,272 +0,0 @@ -(function() { - 'use strict'; - // deferredSetup.js must be loaded first - - R.deferredRender = function(state) { - if (!aborted && ( - !R.progCopy || - !R.progRed || - !R.progClear || - !R.prog_Ambient || - !R.prog_BlinnPhong_PointLight || - !R.prog_Debug || - !R.progPost1)) { - console.log('waiting for programs to load...'); - return; - } - - // Move the R.lights - for (var i = 0; i < R.lights.length; i++) { - // OPTIONAL TODO: Edit if you want to change how lights move - var mn = R.light_min[1]; - var mx = R.light_max[1]; - R.lights[i].pos[1] = (R.lights[i].pos[1] + R.light_dt - mn + mx) % mx + mn; - } - - // Execute deferred shading pipeline - - // CHECKITOUT: START HERE! You can even uncomment this: - //debugger; - - { // TODO: this block should be removed after testing renderFullScreenQuad - gl.bindFramebuffer(gl.FRAMEBUFFER, null); - // TODO: Implement/test renderFullScreenQuad first - renderFullScreenQuad(R.progRed); - return; - } - - R.pass_copy.render(state); - - if (cfg && cfg.debugView >= 0) { - // Do a debug render instead of a regular render - // Don't do any post-processing in debug mode - R.pass_debug.render(state); - } else { - // * Deferred pass and postprocessing pass(es) - // TODO: uncomment these - // R.pass_deferred.render(state); - // R.pass_post1.render(state); - - // OPTIONAL TODO: call more postprocessing passes, if any - } - }; - - /** - * 'copy' pass: Render into g-buffers - */ - R.pass_copy.render = function(state) { - // * Bind the framebuffer R.pass_copy.fbo - // TODO: uncomment - // gl.bindFramebuffer(gl.FRAMEBUFFER,R.pass_copy.fbo); - - - // * Clear screen using R.progClear - // TODO: uncomment - // renderFullScreenQuad(R.progClear); - - // * Clear depth buffer to value 1.0 using gl.clearDepth and gl.clear - // TODO: uncomment - // gl.clearDepth(1.0); - // gl.clear(gl.DEPTH_BUFFER_BIT); - - // * "Use" the program R.progCopy.prog - // TODO: uncomment - // gl.useProgram(R.progCopy.prog); - - // TODO: Go write code in glsl/copy.frag.glsl - - var m = state.cameraMat.elements; - // * Upload the camera matrix m to the uniform R.progCopy.u_cameraMat - // using gl.uniformMatrix4fv - // TODO: uncomment - // gl.uniformMatrix4fv(R.progCopy.u_cameraMat, false, m); - - // * Draw the scene - // TODO: uncomment - // drawScene(state); - }; - - var drawScene = function(state) { - for (var i = 0; i < state.models.length; i++) { - var m = state.models[i]; - - // If you want to render one model many times, note: - // readyModelForDraw only needs to be called once. 
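            // For instance (illustrative; "instanceCount" is a made-up name):
            //   readyModelForDraw(R.progCopy, m);
            //   for (var k = 0; k < instanceCount; k++) {
            //       // ...update any per-instance uniforms here...
            //       drawReadyModel(m);
            //   }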
- readyModelForDraw(R.progCopy, m); - - drawReadyModel(m); - } - }; - - R.pass_debug.render = function(state) { - // * Unbind any framebuffer, so we can write to the screen - // TODO: uncomment - // gl.bindFramebuffer(gl.FRAMEBUFFER, null); - - // * Bind/setup the debug "lighting" pass - // * Tell shader which debug view to use - // TODO: uncomment - // bindTexturesForLightPass(R.prog_Debug); - // gl.uniform1i(R.prog_Debug.u_debug, cfg.debugView); - - // * Render a fullscreen quad to perform shading on - // TODO: uncomment - // renderFullScreenQuad(R.prog_Debug); - }; - - /** - * 'deferred' pass: Add lighting results for each individual light - */ - R.pass_deferred.render = function(state) { - // * Bind R.pass_deferred.fbo to write into for later postprocessing - gl.bindFramebuffer(gl.FRAMEBUFFER, R.pass_deferred.fbo); - - // * Clear depth to 1.0 and color to black - gl.clearColor(0.0, 0.0, 0.0, 0.0); - gl.clearDepth(1.0); - gl.clear(gl.COLOR_BUFFER_BIT | gl.DEPTH_BUFFER_BIT); - - // * _ADD_ together the result of each lighting pass - - // Enable blending and use gl.blendFunc to blend with: - // color = 1 * src_color + 1 * dst_color - // Here is a wonderful demo of showing how blend function works: - // http://mrdoob.github.io/webgl-blendfunctions/blendfunc.html - // TODO: uncomment - // gl.enable(gl.BLEND); - // gl.blendEquation( gl.FUNC_ADD ); - // gl.blendFunc(gl.ONE,gl.ONE); - - // * Bind/setup the ambient pass, and render using fullscreen quad - bindTexturesForLightPass(R.prog_Ambient); - renderFullScreenQuad(R.prog_Ambient); - - // * Bind/setup the Blinn-Phong pass, and render using fullscreen quad - bindTexturesForLightPass(R.prog_BlinnPhong_PointLight); - - // TODO: add a loop here, over the values in R.lights, which sets the - // uniforms R.prog_BlinnPhong_PointLight.u_lightPos/Col/Rad etc., - // then does renderFullScreenQuad(R.prog_BlinnPhong_PointLight). - - // TODO: In the lighting loop, use the scissor test optimization - // Enable gl.SCISSOR_TEST, render all lights, then disable it. - // - // getScissorForLight returns null if the scissor is off the screen. - // Otherwise, it returns an array [xmin, ymin, width, height]. 
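        // A sketch of the loop these TODOs describe (illustrative; it only
        // uses handles and helpers already set up elsewhere in this codebase):
        //   gl.enable(gl.SCISSOR_TEST);
        //   for (var j = 0; j < R.lights.length; j++) {
        //       var light = R.lights[j];
        //       var sc = getScissorForLight(state.viewMat, state.projMat, light);
        //       if (!sc) { continue; }
        //       gl.scissor(sc[0], sc[1], sc[2], sc[3]);
        //       gl.uniform3fv(R.prog_BlinnPhong_PointLight.u_lightPos, light.pos);
        //       gl.uniform3fv(R.prog_BlinnPhong_PointLight.u_lightCol, light.col);
        //       gl.uniform1f(R.prog_BlinnPhong_PointLight.u_lightRad, light.rad);
        //       renderFullScreenQuad(R.prog_BlinnPhong_PointLight);
        //   }
        //   gl.disable(gl.SCISSOR_TEST);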
- // - // var sc = getScissorForLight(state.viewMat, state.projMat, light); - - // Disable blending so that it doesn't affect other code - gl.disable(gl.BLEND); - }; - - var bindTexturesForLightPass = function(prog) { - gl.useProgram(prog.prog); - - // * Bind all of the g-buffers and depth buffer as texture uniform - // inputs to the shader - for (var i = 0; i < R.NUM_GBUFFERS; i++) { - gl.activeTexture(gl['TEXTURE' + i]); - gl.bindTexture(gl.TEXTURE_2D, R.pass_copy.gbufs[i]); - gl.uniform1i(prog.u_gbufs[i], i); - } - gl.activeTexture(gl['TEXTURE' + R.NUM_GBUFFERS]); - gl.bindTexture(gl.TEXTURE_2D, R.pass_copy.depthTex); - gl.uniform1i(prog.u_depth, R.NUM_GBUFFERS); - }; - - /** - * 'post1' pass: Perform (first) pass of post-processing - */ - R.pass_post1.render = function(state) { - // * Unbind any existing framebuffer (if there are no more passes) - gl.bindFramebuffer(gl.FRAMEBUFFER, null); - - // * Clear the framebuffer depth to 1.0 - gl.clearDepth(1.0); - gl.clear(gl.DEPTH_BUFFER_BIT); - - // * Bind the postprocessing shader program - gl.useProgram(R.progPost1.prog); - - // * Bind the deferred pass's color output as a texture input - // Set gl.TEXTURE0 as the gl.activeTexture unit - // TODO: uncomment - // gl.activeTexture(gl.TEXTURE0); - - // Bind the TEXTURE_2D, R.pass_deferred.colorTex to the active texture unit - // TODO: uncomment - // gl.bindTexture(gl.TEXTURE_2D, R.pass_deferred.colorTex); - - // Configure the R.progPost1.u_color uniform to point at texture unit 0 - gl.uniform1i(R.progPost1.u_color, 0); - - // * Render a fullscreen quad to perform shading on - renderFullScreenQuad(R.progPost1); - }; - - var renderFullScreenQuad = (function() { - // The variables in this function are private to the implementation of - // renderFullScreenQuad. They work like static local variables in C++. - - // Create an array of floats, where each set of 3 is a vertex position. - // You can render in normalized device coordinates (NDC) so that the - // vertex shader doesn't have to do any transformation; draw two - // triangles which cover the screen over x = -1..1 and y = -1..1. - // This array is set up to use gl.drawArrays with gl.TRIANGLE_STRIP. - var positions = new Float32Array([ - -1.0, -1.0, 0.0, - 1.0, -1.0, 0.0, - -1.0, 1.0, 0.0, - 1.0, 1.0, 0.0 - ]); - - var vbo = null; - - var init = function() { - // Create a new buffer with gl.createBuffer, and save it as vbo. - // TODO: uncomment - vbo = gl.createBuffer(); - - // Bind the VBO as the gl.ARRAY_BUFFER - // TODO: uncomment - // gl.bindBuffer(gl.ARRAY_BUFFER,vbo); - - // Upload the positions array to the currently-bound array buffer - // using gl.bufferData in static draw mode. - // TODO: uncomment - // gl.bufferData(gl.ARRAY_BUFFER,positions,gl.STATIC_DRAW); - }; - - return function(prog) { - if (!vbo) { - // If the vbo hasn't been initialized, initialize it. - init(); - } - - // Bind the program to use to draw the quad - gl.useProgram(prog.prog); - - // Bind the VBO as the gl.ARRAY_BUFFER - // TODO: uncomment - // gl.bindBuffer(gl.ARRAY_BUFFER, vbo); - - // Enable the bound buffer as the vertex attrib array for - // prog.a_position, using gl.enableVertexAttribArray - // TODO: uncomment - // gl.enableVertexAttribArray(prog.a_position); - - // Use gl.vertexAttribPointer to tell WebGL the type/layout for - // prog.a_position's access pattern. - // TODO: uncomment - // gl.vertexAttribPointer(prog.a_position, 3, gl.FLOAT, gl.FALSE, 0, 0); - - // Use gl.drawArrays (or gl.drawElements) to draw your quad. 
- // TODO: uncomment - // gl.drawArrays(gl.TRIANGLE_STRIP, 0, 4); - - // Unbind the array buffer. - gl.bindBuffer(gl.ARRAY_BUFFER, null); - }; - })(); -})(); diff --git a/js/deferredSetup.js b/js/deferredSetup.js deleted file mode 100644 index 65136e0..0000000 --- a/js/deferredSetup.js +++ /dev/null @@ -1,229 +0,0 @@ -(function() { - 'use strict'; - - window.R = {}; - R.pass_copy = {}; - R.pass_debug = {}; - R.pass_deferred = {}; - R.pass_post1 = {}; - R.lights = []; - - R.NUM_GBUFFERS = 4; - - /** - * Set up the deferred pipeline framebuffer objects and textures. - */ - R.deferredSetup = function() { - setupLights(); - loadAllShaderPrograms(); - R.pass_copy.setup(); - R.pass_deferred.setup(); - }; - - // TODO: Edit if you want to change the light initial positions - R.light_min = [-14, 0, -6]; - R.light_max = [14, 18, 6]; - R.light_dt = -0.03; - R.LIGHT_RADIUS = 4.0; - R.NUM_LIGHTS = 20; // TODO: test with MORE lights! - var setupLights = function() { - Math.seedrandom(0); - - var posfn = function() { - var r = [0, 0, 0]; - for (var i = 0; i < 3; i++) { - var mn = R.light_min[i]; - var mx = R.light_max[i]; - r[i] = Math.random() * (mx - mn) + mn; - } - return r; - }; - - for (var i = 0; i < R.NUM_LIGHTS; i++) { - R.lights.push({ - pos: posfn(), - col: [ - 1 + Math.random(), - 1 + Math.random(), - 1 + Math.random()], - rad: R.LIGHT_RADIUS - }); - } - }; - - /** - * Create/configure framebuffer between "copy" and "deferred" stages - */ - R.pass_copy.setup = function() { - // * Create the FBO - R.pass_copy.fbo = gl.createFramebuffer(); - // * Create, bind, and store a depth target texture for the FBO - R.pass_copy.depthTex = createAndBindDepthTargetTexture(R.pass_copy.fbo); - - // * Create, bind, and store "color" target textures for the FBO - R.pass_copy.gbufs = []; - var attachments = []; - for (var i = 0; i < R.NUM_GBUFFERS; i++) { - var attachment = gl_draw_buffers['COLOR_ATTACHMENT' + i + '_WEBGL']; - var tex = createAndBindColorTargetTexture(R.pass_copy.fbo, attachment); - R.pass_copy.gbufs.push(tex); - attachments.push(attachment); - } - - // * Check for framebuffer errors - abortIfFramebufferIncomplete(R.pass_copy.fbo); - // * Tell the WEBGL_draw_buffers extension which FBO attachments are - // being used. (This extension allows for multiple render targets.) - gl_draw_buffers.drawBuffersWEBGL(attachments); - - gl.bindFramebuffer(gl.FRAMEBUFFER, null); - }; - - /** - * Create/configure framebuffer between "deferred" and "post1" stages - */ - R.pass_deferred.setup = function() { - // * Create the FBO - R.pass_deferred.fbo = gl.createFramebuffer(); - // * Create, bind, and store a single color target texture for the FBO - R.pass_deferred.colorTex = createAndBindColorTargetTexture( - R.pass_deferred.fbo, gl_draw_buffers.COLOR_ATTACHMENT0_WEBGL); - - // * Check for framebuffer errors - abortIfFramebufferIncomplete(R.pass_deferred.fbo); - // * Tell the WEBGL_draw_buffers extension which FBO attachments are - // being used. (This extension allows for multiple render targets.) - gl_draw_buffers.drawBuffersWEBGL([gl_draw_buffers.COLOR_ATTACHMENT0_WEBGL]); - - gl.bindFramebuffer(gl.FRAMEBUFFER, null); - }; - - /** - * Loads all of the shader programs used in the pipeline. 
- */ - var loadAllShaderPrograms = function() { - loadShaderProgram(gl, 'glsl/copy.vert.glsl', 'glsl/copy.frag.glsl', - function(prog) { - // Create an object to hold info about this shader program - var p = { prog: prog }; - - // Retrieve the uniform and attribute locations - p.u_cameraMat = gl.getUniformLocation(prog, 'u_cameraMat'); - p.u_colmap = gl.getUniformLocation(prog, 'u_colmap'); - p.u_normap = gl.getUniformLocation(prog, 'u_normap'); - p.a_position = gl.getAttribLocation(prog, 'a_position'); - p.a_normal = gl.getAttribLocation(prog, 'a_normal'); - p.a_uv = gl.getAttribLocation(prog, 'a_uv'); - - // Save the object into this variable for access later - R.progCopy = p; - }); - - loadShaderProgram(gl, 'glsl/quad.vert.glsl', 'glsl/red.frag.glsl', - function(prog) { - // Create an object to hold info about this shader program - R.progRed = { prog: prog }; - }); - - loadShaderProgram(gl, 'glsl/quad.vert.glsl', 'glsl/clear.frag.glsl', - function(prog) { - // Create an object to hold info about this shader program - R.progClear = { prog: prog }; - }); - - loadDeferredProgram('ambient', function(p) { - // Save the object into this variable for access later - R.prog_Ambient = p; - }); - - loadDeferredProgram('blinnphong-pointlight', function(p) { - // Save the object into this variable for access later - p.u_lightPos = gl.getUniformLocation(p.prog, 'u_lightPos'); - p.u_lightCol = gl.getUniformLocation(p.prog, 'u_lightCol'); - p.u_lightRad = gl.getUniformLocation(p.prog, 'u_lightRad'); - R.prog_BlinnPhong_PointLight = p; - }); - - loadDeferredProgram('debug', function(p) { - p.u_debug = gl.getUniformLocation(p.prog, 'u_debug'); - // Save the object into this variable for access later - R.prog_Debug = p; - }); - - loadPostProgram('one', function(p) { - p.u_color = gl.getUniformLocation(p.prog, 'u_color'); - // Save the object into this variable for access later - R.progPost1 = p; - }); - - // TODO: If you add more passes, load and set up their shader programs. 
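        // e.g., for a hypothetical bloom pass (the name 'bloom' and
        // R.progBloom are placeholders, not part of the starter code):
        //   loadPostProgram('bloom', function(p) {
        //       p.u_color = gl.getUniformLocation(p.prog, 'u_color');
        //       R.progBloom = p;
        //   });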
- }; - - var loadDeferredProgram = function(name, callback) { - loadShaderProgram(gl, 'glsl/quad.vert.glsl', - 'glsl/deferred/' + name + '.frag.glsl', - function(prog) { - // Create an object to hold info about this shader program - var p = { prog: prog }; - - // Retrieve the uniform and attribute locations - p.u_gbufs = []; - for (var i = 0; i < R.NUM_GBUFFERS; i++) { - p.u_gbufs[i] = gl.getUniformLocation(prog, 'u_gbufs[' + i + ']'); - } - p.u_depth = gl.getUniformLocation(prog, 'u_depth'); - p.a_position = gl.getAttribLocation(prog, 'a_position'); - - callback(p); - }); - }; - - var loadPostProgram = function(name, callback) { - loadShaderProgram(gl, 'glsl/quad.vert.glsl', - 'glsl/post/' + name + '.frag.glsl', - function(prog) { - // Create an object to hold info about this shader program - var p = { prog: prog }; - - // Retrieve the uniform and attribute locations - p.a_position = gl.getAttribLocation(prog, 'a_position'); - - callback(p); - }); - }; - - var createAndBindDepthTargetTexture = function(fbo) { - var depthTex = gl.createTexture(); - gl.bindTexture(gl.TEXTURE_2D, depthTex); - gl.texParameteri(gl.TEXTURE_2D, gl.TEXTURE_MAG_FILTER, gl.NEAREST); - gl.texParameteri(gl.TEXTURE_2D, gl.TEXTURE_MIN_FILTER, gl.NEAREST); - gl.texParameteri(gl.TEXTURE_2D, gl.TEXTURE_WRAP_S, gl.CLAMP_TO_EDGE); - gl.texParameteri(gl.TEXTURE_2D, gl.TEXTURE_WRAP_T, gl.CLAMP_TO_EDGE); - gl.texImage2D( - gl.TEXTURE_2D, 0, gl.DEPTH_COMPONENT, width, height, 0, - gl.DEPTH_COMPONENT, gl.UNSIGNED_SHORT, null); - gl.bindTexture(gl.TEXTURE_2D, null); - - gl.bindFramebuffer(gl.FRAMEBUFFER, fbo); - gl.framebufferTexture2D( - gl.FRAMEBUFFER, gl.DEPTH_ATTACHMENT, gl.TEXTURE_2D, depthTex, 0); - - return depthTex; - }; - - var createAndBindColorTargetTexture = function(fbo, attachment) { - var tex = gl.createTexture(); - gl.bindTexture(gl.TEXTURE_2D, tex); - gl.texParameteri(gl.TEXTURE_2D, gl.TEXTURE_MAG_FILTER, gl.NEAREST); - gl.texParameteri(gl.TEXTURE_2D, gl.TEXTURE_MIN_FILTER, gl.NEAREST); - gl.texParameteri(gl.TEXTURE_2D, gl.TEXTURE_WRAP_S, gl.CLAMP_TO_EDGE); - gl.texParameteri(gl.TEXTURE_2D, gl.TEXTURE_WRAP_T, gl.CLAMP_TO_EDGE); - gl.texImage2D(gl.TEXTURE_2D, 0, gl.RGBA, width, height, 0, gl.RGBA, gl.FLOAT, null); - gl.bindTexture(gl.TEXTURE_2D, null); - - gl.bindFramebuffer(gl.FRAMEBUFFER, fbo); - gl.framebufferTexture2D(gl.FRAMEBUFFER, attachment, gl.TEXTURE_2D, tex, 0); - - return tex; - }; -})(); diff --git a/js/framework.js b/js/framework.js deleted file mode 100644 index 4f944ee..0000000 --- a/js/framework.js +++ /dev/null @@ -1,333 +0,0 @@ -var gl, gl_draw_buffers; -var width, height; - -(function() { - 'use strict'; - - var canvas, renderer, scene, camera, controls, stats; - var models = []; - - var cameraMat = new THREE.Matrix4(); - - var render = function() { - camera.updateMatrixWorld(); - camera.matrixWorldInverse.getInverse(camera.matrixWorld); - cameraMat.multiplyMatrices(camera.projectionMatrix, camera.matrixWorldInverse); - R.deferredRender({ - cameraMat: cameraMat, - projMat: camera.projectionMatrix, - viewMat: camera.matrixWorldInverse, - cameraPos: camera.position, - models: models - }); - }; - - var update = function() { - controls.update(); - stats.end(); - stats.begin(); - render(); - if (!aborted) { - requestAnimationFrame(update); - } - }; - - var resize = function() { - camera.aspect = width / height; - camera.updateProjectionMatrix(); - renderer.setSize(width, height); - render(); - }; - - var initExtensions = function() { - var extensions = gl.getSupportedExtensions(); - 
console.log(extensions); - - var reqd = [ - 'OES_texture_float', - 'OES_texture_float_linear', - 'WEBGL_depth_texture', - 'WEBGL_draw_buffers' - ]; - for (var i = 0; i < reqd.length; i++) { - var e = reqd[i]; - if (extensions.indexOf(e) < 0) { - abort('unable to load extension: ' + e); - } - } - - gl.getExtension('OES_texture_float'); - gl.getExtension('OES_texture_float_linear'); - gl.getExtension('WEBGL_depth_texture'); - - gl_draw_buffers = gl.getExtension('WEBGL_draw_buffers'); - var maxdb = gl.getParameter(gl_draw_buffers.MAX_DRAW_BUFFERS_WEBGL); - console.log('MAX_DRAW_BUFFERS_WEBGL: ' + maxdb); - }; - - var init = function() { - // TODO: For performance measurements, disable debug mode! - var debugMode = true; - - canvas = document.getElementById('canvas'); - renderer = new THREE.WebGLRenderer({ - canvas: canvas, - preserveDrawingBuffer: debugMode - }); - gl = renderer.context; - - if (debugMode) { - $('#debugmodewarning').css('display', 'block'); - var throwOnGLError = function(err, funcName, args) { - abort(WebGLDebugUtils.glEnumToString(err) + - " was caused by call to: " + funcName); - }; - gl = WebGLDebugUtils.makeDebugContext(gl, throwOnGLError); - } - - initExtensions(); - - stats = new Stats(); - stats.setMode(1); // 0: fps, 1: ms, 2: mb - stats.domElement.style.position = 'absolute'; - stats.domElement.style.left = '0px'; - stats.domElement.style.top = '0px'; - document.body.appendChild(stats.domElement); - - scene = new THREE.Scene(); - - width = canvas.width; - height = canvas.height; - camera = new THREE.PerspectiveCamera( - 45, // Field of view - width / height, // Aspect ratio - 1.0, // Near plane - 100 // Far plane - ); - camera.position.set(-15.5, 1, -1); - - controls = new THREE.OrbitControls(camera, renderer.domElement); - controls.enableDamping = true; - controls.enableZoom = true; - controls.target.set(0, 4, 0); - controls.rotateSpeed = 0.3; - controls.zoomSpeed = 1.0; - controls.panSpeed = 2.0; - - // Add sphere geometry to the scene so it gets initialized - var sph = new THREE.Mesh(new THREE.SphereGeometry(1, 8, 6)); - scene.add(sph); - renderer.render(scene, camera); - uploadModel(sph, function(m) { - R.sphereModel = m; - }); - - // var glTFURL = 'models/glTF-duck/duck.gltf'; - var glTFURL = 'models/glTF-sponza-kai-fix/sponza.gltf'; - var glTFLoader = new MinimalGLTFLoader.glTFLoader(gl); - glTFLoader.loadGLTF(glTFURL, function (glTF) { - var curScene = glTF.scenes[glTF.defaultScene]; - - var webGLTextures = {}; - - // temp var - var i,len; - var primitiveOrderID; - - var mesh; - var primitive; - var vertexBuffer; - var indicesBuffer; - - - // textures setting - var textureID = 0; - var textureInfo; - var samplerInfo; - var target, format, internalFormat, type; // texture info - var magFilter, minFilter, wrapS, wrapT; - var image; - var texture; - - - // temp for sponza - var colorTextureName = 'texture_color'; - var normalTextureName = 'texture_normal'; - - // textures - for (var tid in glTF.json.textures) { - - textureInfo = glTF.json.textures[tid]; - target = textureInfo.target || gl.TEXTURE_2D; - format = textureInfo.format || gl.RGBA; - internalFormat = textureInfo.format || gl.RGBA; - type = textureInfo.type || gl.UNSIGNED_BYTE; - - image = glTF.images[textureInfo.source]; - - texture = gl.createTexture(); - gl.activeTexture(gl.TEXTURE0 + textureID); - gl.bindTexture(target, texture); - - switch(target) { - case 3553: // gl.TEXTURE_2D - gl.texImage2D(target, 0, internalFormat, format, type, image); - break; - // TODO for TA - } - - // !! 
Sampler - // raw WebGL 1, no sampler object, set magfilter, wrapS, etc - samplerInfo = glTF.json.samplers[textureInfo.sampler]; - minFilter = samplerInfo.minFilter || gl.NEAREST_MIPMAP_LINEAR; - magFilter = samplerInfo.magFilter || gl.LINEAR; - wrapS = samplerInfo.wrapS || gl.REPEAT; - wrapT = samplerInfo.wrapT || gl.REPEAT; - gl.texParameteri(target, gl.TEXTURE_MIN_FILTER, minFilter); - gl.texParameteri(target, gl.TEXTURE_MAG_FILTER, magFilter); - gl.texParameteri(target, gl.TEXTURE_WRAP_S, wrapS); - gl.texParameteri(target, gl.TEXTURE_WRAP_T, wrapT); - if (minFilter == gl.NEAREST_MIPMAP_NEAREST || - minFilter == gl.NEAREST_MIPMAP_LINEAR || - minFilter == gl.LINEAR_MIPMAP_NEAREST || - minFilter == gl.LINEAR_MIPMAP_LINEAR ) { - gl.generateMipmap(target); - } - - - gl.bindTexture(target, null); - - webGLTextures[tid] = { - texture: texture, - target: target, - id: textureID - }; - - textureID++; - } - - - // vertex attributes - for (var mid in curScene.meshes) { - mesh = curScene.meshes[mid]; - - for (i = 0, len = mesh.primitives.length; i < len; ++i) { - primitive = mesh.primitives[i]; - - - vertexBuffer = gl.createBuffer(); - indicesBuffer = gl.createBuffer(); - - // initialize buffer - var vertices = primitive.vertexBuffer; - gl.bindBuffer(gl.ARRAY_BUFFER, vertexBuffer); - gl.bufferData(gl.ARRAY_BUFFER, vertices, gl.STATIC_DRAW); - gl.bindBuffer(gl.ARRAY_BUFFER, null); - - var indices = primitive.indices; - gl.bindBuffer(gl.ELEMENT_ARRAY_BUFFER, indicesBuffer); - gl.bufferData(gl.ELEMENT_ARRAY_BUFFER, indices, gl.STATIC_DRAW); - gl.bindBuffer(gl.ELEMENT_ARRAY_BUFFER, null); - - - var posInfo = primitive.attributes[primitive.technique.parameters['position'].semantic]; - var norInfo = primitive.attributes[primitive.technique.parameters['normal'].semantic]; - var uvInfo = primitive.attributes[primitive.technique.parameters['texcoord_0'].semantic]; - - models.push({ - gltf: primitive, - - idx: indicesBuffer, - - attributes: vertexBuffer, - posInfo: {size: posInfo.size, type: posInfo.type, stride: posInfo.stride, offset: posInfo.offset}, - norInfo: {size: norInfo.size, type: norInfo.type, stride: norInfo.stride, offset: norInfo.offset}, - uvInfo: {size: uvInfo.size, type: uvInfo.type, stride: uvInfo.stride, offset: uvInfo.offset}, - - // specific textures temp test - colmap: webGLTextures[colorTextureName].texture, - normap: webGLTextures[normalTextureName].texture - }); - - } - - } - - - - }); - - - resize(); - // renderer.render(scene, camera); - - gl.clearColor(0.5, 0.5, 0.5, 0.5); - gl.clearDepth(1.0); - gl.clear(gl.COLOR_BUFFER_BIT | gl.DEPTH_BUFFER_BIT); - - R.deferredSetup(); - - requestAnimationFrame(update); - }; - - var uploadModel = function(o, callback) { - for (var i = -1; i < o.children.length; i++) { - var c, g, idx; - if (i < 0) { - c = o; - if (!c.geometry) { - continue; - } - g = c.geometry._bufferGeometry.attributes; - idx = c.geometry._bufferGeometry.index; - } else { - c = o.children[i]; - g = c.geometry.attributes; - idx = c.geometry.index; - } - - var gposition = gl.createBuffer(); - gl.bindBuffer(gl.ARRAY_BUFFER, gposition); - gl.bufferData(gl.ARRAY_BUFFER, g.position.array, gl.STATIC_DRAW); - - var gnormal; - if (g.normal && g.normal.array) { - gnormal = gl.createBuffer(); - gl.bindBuffer(gl.ARRAY_BUFFER, gnormal); - gl.bufferData(gl.ARRAY_BUFFER, g.normal.array, gl.STATIC_DRAW); - } - - var guv; - if (g.uv && g.uv.array) { - guv = gl.createBuffer(); - gl.bindBuffer(gl.ARRAY_BUFFER, guv); - gl.bufferData(gl.ARRAY_BUFFER, g.uv.array, gl.STATIC_DRAW); - } - - if (!idx) 
{ - idx = new Uint32Array(g.position.array.length / 3); - for (var j = 0; j < idx.length; j++) { - idx[j] = j; - } - } - - var gidx = gl.createBuffer(); - gl.bindBuffer(gl.ELEMENT_ARRAY_BUFFER, gidx); - gl.bufferData(gl.ELEMENT_ARRAY_BUFFER, idx, gl.STATIC_DRAW); - - var m = { - idx: gidx, - elemCount: idx.length, - position: gposition, - normal: gnormal, - uv: guv - }; - - if (callback) { - callback(m); - } - } - }; - - window.handle_load.push(init); -})(); diff --git a/js/main.js b/js/main.js deleted file mode 100644 index bf01b17..0000000 --- a/js/main.js +++ /dev/null @@ -1,11 +0,0 @@ -var handle_load = []; - -(function() { - 'use strict'; - - window.onload = function() { - for (var i = 0; i < handle_load.length; i++) { - handle_load[i](); - } - }; -})(); diff --git a/js/ui.js b/js/ui.js deleted file mode 100644 index abd6119..0000000 --- a/js/ui.js +++ /dev/null @@ -1,36 +0,0 @@ -var cfg; - -(function() { - 'use strict'; - - var Cfg = function() { - // TODO: Define config fields and defaults here - this.debugView = -1; - this.debugScissor = false; - this.enableEffect0 = false; - }; - - var init = function() { - cfg = new Cfg(); - - var gui = new dat.GUI(); - // TODO: Define any other possible config values - gui.add(cfg, 'debugView', { - 'None': -1, - '0 Depth': 0, - '1 Position': 1, - '2 Geometry normal': 2, - '3 Color map': 3, - '4 Normal map': 4, - '5 Surface normal': 5 - }); - gui.add(cfg, 'debugScissor'); - - var eff0 = gui.addFolder('EFFECT NAME HERE'); - eff0.open(); - eff0.add(cfg, 'enableEffect0'); - // TODO: add more effects toggles and parameters here - }; - - window.handle_load.push(init); -})(); diff --git a/js/util.js b/js/util.js deleted file mode 100644 index 8f43d38..0000000 --- a/js/util.js +++ /dev/null @@ -1,188 +0,0 @@ -window.aborted = false; -window.abort = function(s) { - 'use strict'; - var m = 'Fatal error: ' + s; - if (!aborted) { - $('#alertcontainer').css('display', 'block'); - aborted = true; - $('#alerttext').text(m); - } - console.log(m); - throw m; -}; - -window.loadTexture = (function() { - 'use strict'; - - var handleTextureLoaded = function(img, tex) { - gl.bindTexture(gl.TEXTURE_2D, tex); - gl.texImage2D(gl.TEXTURE_2D, 0, gl.RGBA, gl.RGBA, gl.UNSIGNED_BYTE, img); - gl.texParameteri(gl.TEXTURE_2D, gl.TEXTURE_MAG_FILTER, gl.LINEAR); - gl.texParameteri(gl.TEXTURE_2D, gl.TEXTURE_MIN_FILTER, gl.LINEAR_MIPMAP_NEAREST); - gl.texParameteri(gl.TEXTURE_2D, gl.TEXTURE_WRAP_S, gl.REPEAT); - gl.texParameteri(gl.TEXTURE_2D, gl.TEXTURE_WRAP_T, gl.REPEAT); - gl.generateMipmap(gl.TEXTURE_2D); - gl.bindTexture(gl.TEXTURE_2D, null); - }; - - return function(url) { - return new Promise(function(resolve){ - var prom = Promise.resolve(); - - var tex = gl.createTexture(); - var img = new Image(); - img.onload = function() { - handleTextureLoaded(img, tex); - resolve(tex); - }; - img.src = url; - }); - }; -})(); - -window.loadShaderProgram = (function() { - 'use strict'; - - var compileShader = function(gl, shaderSource, shaderType) { - var shader = gl.createShader(shaderType); - gl.shaderSource(shader, shaderSource); - gl.compileShader(shader); - if (!gl.getShaderParameter(shader, gl.COMPILE_STATUS)) { - console.error(shaderSource); - abort('shader compiler error:\n' + gl.getShaderInfoLog(shader)); - } - - return shader; - }; - - var linkShader = function(gl, vs, fs) { - var prog = gl.createProgram(); - gl.attachShader(prog, vs); - gl.attachShader(prog, fs); - gl.linkProgram(prog); - if (!gl.getProgramParameter(prog, gl.LINK_STATUS)) { - abort('shader linker error:\n' + 
gl.getProgramInfoLog(prog)); - } - return prog; - }; - - return function(gl, urlVS, urlFS, callback) { - return Promise.all([$.get(urlVS), $.get(urlFS)]).then( - function(results) { - var vs = results[0], fs = results[1]; - vs = compileShader(gl, vs, gl.VERTEX_SHADER); - fs = compileShader(gl, fs, gl.FRAGMENT_SHADER); - return linkShader(gl, vs, fs); - }).then(callback).catch(abort); - }; -})(); - -window.readyModelForDraw = function(prog, m) { - gl.useProgram(prog.prog); - - if (m.colmap) { - gl.activeTexture(gl.TEXTURE0); - gl.bindTexture(gl.TEXTURE_2D, m.colmap); - gl.uniform1i(prog.u_colmap, 0); - } - - if (m.normap) { - gl.activeTexture(gl.TEXTURE1); - gl.bindTexture(gl.TEXTURE_2D, m.normap); - gl.uniform1i(prog.u_normap, 1); - } - - gl.bindBuffer(gl.ARRAY_BUFFER, m.attributes); - - gl.enableVertexAttribArray(prog.a_position); - gl.vertexAttribPointer(prog.a_position, m.posInfo.size, m.posInfo.type, false, m.posInfo.stride, m.posInfo.offset); - - gl.enableVertexAttribArray(prog.a_normal); - gl.vertexAttribPointer(prog.a_normal, m.norInfo.size, m.norInfo.type, false, m.norInfo.stride, m.norInfo.offset); - - gl.enableVertexAttribArray(prog.a_uv); - gl.vertexAttribPointer(prog.a_uv, m.uvInfo.size, m.uvInfo.type, false, m.uvInfo.stride, m.uvInfo.offset); - - gl.bindBuffer(gl.ELEMENT_ARRAY_BUFFER, m.idx); -}; - -window.drawReadyModel = function(m) { - // TODO for TA in future: matrix transform for multiple hierachy gltf models - // reference: https://github.com/CIS565-Fall-2016/Project5A-WebGL-Forward-Plus-Shading-with-glTF/blob/master/js/forwardPlusRenderer/forwardPlusRenderer.js#L201 - - gl.drawElements(m.gltf.mode, m.gltf.indices.length, m.gltf.indicesComponentType, 0); -}; - -window.getScissorForLight = (function() { - // Pre-allocate for performance - avoids additional allocation - var a = new THREE.Vector4(0, 0, 0, 0); - var b = new THREE.Vector4(0, 0, 0, 0); - var minpt = new THREE.Vector2(0, 0); - var maxpt = new THREE.Vector2(0, 0); - var ret = [0, 0, 0, 0]; - - return function(view, proj, l) { - // front bottom-left corner of sphere's bounding cube - a.fromArray(l.pos); - a.w = 1; - a.applyMatrix4(view); - a.x -= l.rad; - a.y -= l.rad; - a.z += l.rad; - a.applyMatrix4(proj); - a.divideScalar(a.w); - - // front bottom-left corner of sphere's bounding cube - b.fromArray(l.pos); - b.w = 1; - b.applyMatrix4(view); - b.x += l.rad; - b.y += l.rad; - b.z += l.rad; - b.applyMatrix4(proj); - b.divideScalar(b.w); - - minpt.set(Math.max(-1, a.x), Math.max(-1, a.y)); - maxpt.set(Math.min( 1, b.x), Math.min( 1, b.y)); - - if (maxpt.x < -1 || 1 < minpt.x || - maxpt.y < -1 || 1 < minpt.y) { - return null; - } - - minpt.addScalar(1.0); minpt.multiplyScalar(0.5); - maxpt.addScalar(1.0); maxpt.multiplyScalar(0.5); - - ret[0] = Math.round(width * minpt.x); - ret[1] = Math.round(height * minpt.y); - ret[2] = Math.round(width * (maxpt.x - minpt.x)); - ret[3] = Math.round(height * (maxpt.y - minpt.y)); - return ret; - }; -})(); - -window.abortIfFramebufferIncomplete = function(fbo) { - gl.bindFramebuffer(gl.FRAMEBUFFER, fbo); - var fbstatus = gl.checkFramebufferStatus(gl.FRAMEBUFFER); - if (fbstatus !== gl.FRAMEBUFFER_COMPLETE) { - abort('framebuffer incomplete: ' + WebGLDebugUtils.glEnumToString(fbstatus)); - } -}; - -window.downloadCanvas = (function() { - var downloadURI = function(uri, name) { - var link = document.createElement('a'); - link.download = name; - link.href = uri; - document.body.appendChild(link); - link.click(); - document.body.removeChild(link); - }; - - return function() { - var 
canvas = document.getElementById('canvas'); - var time = Date.now(); - var img = canvas.toDataURL('image/png'); - downloadURI(img, 'deferred-' + time + '.png'); - }; -})(); diff --git a/models/glTF-duck/Duck.bin b/models/glTF-duck/Duck.bin deleted file mode 100644 index c5fa1b8..0000000 Binary files a/models/glTF-duck/Duck.bin and /dev/null differ diff --git a/models/glTF-duck/Duck.gltf b/models/glTF-duck/Duck.gltf deleted file mode 100644 index 051777f..0000000 --- a/models/glTF-duck/Duck.gltf +++ /dev/null @@ -1,362 +0,0 @@ -{ - "accessors": { - "accessor_21": { - "bufferView": "bufferView_29", - "byteOffset": 0, - "byteStride": 0, - "componentType": 5123, - "count": 12636, - "type": "SCALAR" - }, - "accessor_23": { - "bufferView": "bufferView_30", - "byteOffset": 0, - "byteStride": 12, - "componentType": 5126, - "count": 2399, - "max": [ - 0.961799, - 1.6397, - 0.539252 - ], - "min": [ - -0.692985, - 0.0992937, - -0.613282 - ], - "type": "VEC3" - }, - "accessor_25": { - "bufferView": "bufferView_30", - "byteOffset": 28788, - "byteStride": 12, - "componentType": 5126, - "count": 2399, - "max": [ - 0.999599, - 0.999581, - 0.998436 - ], - "min": [ - -0.999084, - -1, - -0.999832 - ], - "type": "VEC3" - }, - "accessor_27": { - "bufferView": "bufferView_30", - "byteOffset": 57576, - "byteStride": 8, - "componentType": 5126, - "count": 2399, - "max": [ - 0.983346, - 0.980037 - ], - "min": [ - 0.026409, - 0.019963 - ], - "type": "VEC2" - } - }, - "animations": {}, - "asset": { - "generator": "collada2gltf@027f74366341d569dea42e9a68b7104cc3892054", - "premultipliedAlpha": true, - "profile": { - "api": "WebGL", - "version": "1.0.2" - }, - "version": "1.0" - }, - "bufferViews": { - "bufferView_29": { - "buffer": "Duck", - "byteLength": 25272, - "byteOffset": 0, - "target": 34963 - }, - "bufferView_30": { - "buffer": "Duck", - "byteLength": 76768, - "byteOffset": 25272, - "target": 34962 - } - }, - "buffers": { - "Duck": { - "byteLength": 102040, - "type": "arraybuffer", - "uri": "Duck.bin" - } - }, - "cameras": { - "cameraShape1": { - "name": "cameraShape1", - "perspective": { - "aspectRatio": 1.5, - "yfov": 0.660593, - "zfar": 100, - "znear": 0.01 - }, - "type": "perspective" - } - }, - "images": { - "file2": { - "name": "file2", - "uri": "DuckCM.png" - } - }, - "materials": { - "blinn3-fx": { - "name": "blinn3", - "technique": "technique0", - "values": { - "ambient": [ - 0, - 0, - 0, - 1 - ], - "diffuse": "texture_file2", - "emission": [ - 0, - 0, - 0, - 1 - ], - "shininess": 38.4, - "specular": [ - 0, - 0, - 0, - 1 - ] - } - } - }, - "meshes": { - "LOD3spShape-lib": { - "name": "LOD3spShape", - "primitives": [ - { - "attributes": { - "NORMAL": "accessor_25", - "POSITION": "accessor_23", - "TEXCOORD_0": "accessor_27" - }, - "indices": "accessor_21", - "material": "blinn3-fx", - "mode": 4 - } - ] - } - }, - "nodes": { - "LOD3sp": { - "children": [], - "matrix": [ - 1, - 0, - 0, - 0, - 0, - 1, - 0, - 0, - 0, - 0, - 1, - 0, - 0, - 0, - 0, - 1 - ], - "meshes": [ - "LOD3spShape-lib" - ], - "name": "LOD3sp" - }, - "camera1": { - "camera": "cameraShape1", - "children": [], - "matrix": [ - -0.728969, - 0, - -0.684547, - 0, - -0.425205, - 0.783693, - 0.452797, - 0, - 0.536475, - 0.621148, - -0.571288, - 0, - 4.00113, - 4.63264, - -4.31078, - 1 - ], - "name": "camera1" - }, - "directionalLight1": { - "children": [], - "matrix": [ - -0.954692, - 0.218143, - -0.202429, - 0, - 0.014672, - 0.713885, - 0.700109, - 0, - 0.297235, - 0.665418, - -0.684741, - 0, - 1.48654, - 1.83672, - -2.92179, - 1 - ], - "name": 
"directionalLight1" - } - }, - "programs": { - "program_0": { - "attributes": [ - "a_normal", - "a_position", - "a_texcoord0" - ], - "fragmentShader": "Duck0FS", - "vertexShader": "Duck0VS" - } - }, - "samplers": { - "sampler_0": { - "magFilter": 9729, - "minFilter": 9987, - "wrapS": 10497, - "wrapT": 10497 - } - }, - "scene": "defaultScene", - "scenes": { - "defaultScene": { - "nodes": [ - "LOD3sp", - "camera1", - "directionalLight1" - ] - } - }, - "shaders": { - "Duck0FS": { - "type": 35632, - "uri": "Duck0FS.glsl" - }, - "Duck0VS": { - "type": 35633, - "uri": "Duck0VS.glsl" - } - }, - "skins": {}, - "techniques": { - "technique0": { - "attributes": { - "a_normal": "normal", - "a_position": "position", - "a_texcoord0": "texcoord0" - }, - "parameters": { - "ambient": { - "type": 35666 - }, - "diffuse": { - "type": 35678 - }, - "emission": { - "type": 35666 - }, - "light0Color": { - "type": 35665, - "value": [ - 1, - 1, - 1 - ] - }, - "light0Transform": { - "node": "directionalLight1", - "semantic": "MODELVIEW", - "type": 35676 - }, - "modelViewMatrix": { - "semantic": "MODELVIEW", - "type": 35676 - }, - "normal": { - "semantic": "NORMAL", - "type": 35665 - }, - "normalMatrix": { - "semantic": "MODELVIEWINVERSETRANSPOSE", - "type": 35675 - }, - "position": { - "semantic": "POSITION", - "type": 35665 - }, - "projectionMatrix": { - "semantic": "PROJECTION", - "type": 35676 - }, - "shininess": { - "type": 5126 - }, - "specular": { - "type": 35666 - }, - "texcoord0": { - "semantic": "TEXCOORD_0", - "type": 35664 - } - }, - "program": "program_0", - "states": { - "enable": [ - 2929, - 2884 - ] - }, - "uniforms": { - "u_ambient": "ambient", - "u_diffuse": "diffuse", - "u_emission": "emission", - "u_light0Color": "light0Color", - "u_light0Transform": "light0Transform", - "u_modelViewMatrix": "modelViewMatrix", - "u_normalMatrix": "normalMatrix", - "u_projectionMatrix": "projectionMatrix", - "u_shininess": "shininess", - "u_specular": "specular" - } - } - }, - "textures": { - "texture_file2": { - "format": 6408, - "internalFormat": 6408, - "sampler": "sampler_0", - "source": "file2", - "target": 3553, - "type": 5121 - } - } -} \ No newline at end of file diff --git a/models/glTF-duck/Duck0FS.glsl b/models/glTF-duck/Duck0FS.glsl deleted file mode 100644 index 1a883e6..0000000 --- a/models/glTF-duck/Duck0FS.glsl +++ /dev/null @@ -1,42 +0,0 @@ -precision highp float; -varying vec3 v_normal; -uniform vec4 u_ambient; -varying vec2 v_texcoord0; -uniform sampler2D u_diffuse; -uniform vec4 u_emission; -uniform vec4 u_specular; -uniform float u_shininess; -varying vec3 v_light0Direction; -varying vec3 v_position; -uniform vec3 u_light0Color; -void main(void) { -vec3 normal = normalize(v_normal); -vec4 color = vec4(0., 0., 0., 0.); -vec4 diffuse = vec4(0., 0., 0., 1.); -vec3 diffuseLight = vec3(0., 0., 0.); -vec4 emission; -vec4 ambient; -vec4 specular; -ambient = u_ambient; -diffuse = texture2D(u_diffuse, v_texcoord0); -emission = u_emission; -specular = u_specular; -vec3 specularLight = vec3(0., 0., 0.); -{ -float specularIntensity = 0.; -float attenuation = 1.0; -vec3 l = normalize(v_light0Direction); -vec3 viewDir = -normalize(v_position); -vec3 h = normalize(l+viewDir); -specularIntensity = max(0., pow(max(dot(normal,h), 0.) , u_shininess)) * attenuation; -specularLight += u_light0Color * specularIntensity; -diffuseLight += u_light0Color * max(dot(normal,l), 0.) 
* attenuation; -} -specular.xyz *= specularLight; -color.xyz += specular.xyz; -diffuse.xyz *= diffuseLight; -color.xyz += diffuse.xyz; -color.xyz += emission.xyz; -color = vec4(color.rgb * diffuse.a, diffuse.a); -gl_FragColor = color; -} diff --git a/models/glTF-duck/Duck0VS.glsl b/models/glTF-duck/Duck0VS.glsl deleted file mode 100644 index defe481..0000000 --- a/models/glTF-duck/Duck0VS.glsl +++ /dev/null @@ -1,20 +0,0 @@ -precision highp float; -attribute vec3 a_position; -attribute vec3 a_normal; -varying vec3 v_normal; -uniform mat3 u_normalMatrix; -uniform mat4 u_modelViewMatrix; -uniform mat4 u_projectionMatrix; -attribute vec2 a_texcoord0; -varying vec2 v_texcoord0; -varying vec3 v_light0Direction; -varying vec3 v_position; -uniform mat4 u_light0Transform; -void main(void) { -vec4 pos = u_modelViewMatrix * vec4(a_position,1.0); -v_normal = u_normalMatrix * a_normal; -v_texcoord0 = a_texcoord0; -v_position = pos.xyz; -v_light0Direction = mat3(u_light0Transform) * vec3(0.,0.,1.); -gl_Position = u_projectionMatrix * pos; -} diff --git a/models/glTF-duck/DuckCM.png b/models/glTF-duck/DuckCM.png deleted file mode 100644 index 62d9200..0000000 Binary files a/models/glTF-duck/DuckCM.png and /dev/null differ diff --git a/models/gltf-sponza-kai-fix/buffer_0.bin b/models/gltf-sponza-kai-fix/buffer_0.bin deleted file mode 100644 index e1a3c55..0000000 Binary files a/models/gltf-sponza-kai-fix/buffer_0.bin and /dev/null differ diff --git a/models/gltf-sponza-kai-fix/color.jpeg b/models/gltf-sponza-kai-fix/color.jpeg deleted file mode 100644 index 05248f4..0000000 Binary files a/models/gltf-sponza-kai-fix/color.jpeg and /dev/null differ diff --git a/models/gltf-sponza-kai-fix/fragmentShader0.glsl b/models/gltf-sponza-kai-fix/fragmentShader0.glsl deleted file mode 100644 index ad5d407..0000000 --- a/models/gltf-sponza-kai-fix/fragmentShader0.glsl +++ /dev/null @@ -1,43 +0,0 @@ -precision highp float; -uniform vec4 u_ambient; -uniform sampler2D u_diffuse; -uniform sampler2D u_normal; -uniform vec4 u_emission; -uniform vec4 u_specular; -uniform float u_shininess; -uniform float u_transparency; -varying vec3 v_positionEC; -varying vec3 v_normal; -varying vec2 v_texcoord_0; - -vec3 applyNormalMap(vec3 geomnor, vec3 normap) { - normap = normap * 2.0 - 1.0; - vec3 up = normalize(vec3(0.001, 1, 0.001)); - vec3 surftan = normalize(cross(geomnor, up)); - vec3 surfbinor = cross(geomnor, surftan); - return normap.y * surftan + normap.x * surfbinor + normap.z * geomnor; -} - -void main(void) { - vec3 normal = applyNormalMap(normalize(v_normal), texture2D(u_normal, v_texcoord_0).rgb); - vec4 diffuse = texture2D(u_diffuse, v_texcoord_0); - vec3 diffuseLight = vec3(0.0, 0.0, 0.0); - vec3 specular = u_specular.rgb; - vec3 specularLight = vec3(0.0, 0.0, 0.0); - vec3 emission = u_emission.rgb; - vec3 ambient = u_ambient.rgb; - vec3 viewDir = -normalize(v_positionEC); - vec3 ambientLight = vec3(0.0, 0.0, 0.0); - ambientLight += vec3(0.2, 0.2, 0.2); - vec3 l = vec3(0.0, 0.0, 1.0); - diffuseLight += vec3(1.0, 1.0, 1.0) * max(dot(normal,l), 0.); - vec3 h = normalize(l + viewDir); - float specularIntensity = max(0., pow(max(dot(normal, h), 0.), u_shininess)); - specularLight += vec3(1.0, 1.0, 1.0) * specularIntensity; - vec3 color = vec3(0.0, 0.0, 0.0); - color += diffuse.rgb * diffuseLight; - color += specular * specularLight; - color += emission; - color += ambient * ambientLight; - gl_FragColor = vec4(color * diffuse.a, diffuse.a * u_transparency); -} diff --git a/models/gltf-sponza-kai-fix/normal.png 
b/models/gltf-sponza-kai-fix/normal.png deleted file mode 100644 index 322412b..0000000 Binary files a/models/gltf-sponza-kai-fix/normal.png and /dev/null differ diff --git a/models/gltf-sponza-kai-fix/sponza.gltf b/models/gltf-sponza-kai-fix/sponza.gltf deleted file mode 100644 index e21c7eb..0000000 --- a/models/gltf-sponza-kai-fix/sponza.gltf +++ /dev/null @@ -1,317 +0,0 @@ -{ - "accessors": { - "accessor_index_0": { - "bufferView": "bufferView_1", - "byteOffset": 0, - "byteStride": 0, - "componentType": 5125, - "count": 199269, - "type": "SCALAR", - "min": [ - 0 - ], - "max": [ - 199268 - ] - }, - "accessor_position": { - "bufferView": "bufferView_0", - "byteOffset": 0, - "byteStride": 0, - "componentType": 5126, - "count": 148975, - "min": [ - -17.268321990966797, - -0.006653999909758568, - -7.7815141677856445 - ], - "max": [ - 17.551677703857422, - 15.55334758758545, - 7.818483829498291 - ], - "type": "VEC3" - }, - "accessor_normal": { - "bufferView": "bufferView_0", - "byteOffset": 1787700, - "byteStride": 0, - "componentType": 5126, - "count": 148975, - "type": "VEC3", - "min": [ - null, - null, - null - ], - "max": [ - null, - null, - null - ] - }, - "accessor_uv": { - "bufferView": "bufferView_0", - "byteOffset": 3575400, - "byteStride": 0, - "componentType": 5126, - "count": 148975, - "type": "VEC2", - "min": [ - -57.04376983642578, - -61.176544189453125 - ], - "max": [ - 57.97621536254883, - 62.176544189453125 - ] - } - }, - "asset": { - "generator": "OBJ2GLTF", - "premultipliedAlpha": true, - "profile": { - "api": "WebGL", - "version": "1.0" - }, - "version": "1.0" - }, - "buffers": { - "buffer_0": { - "type": "arraybuffer", - "byteLength": 5564276, - "uri": "buffer_0.bin" - } - }, - "bufferViews": { - "bufferView_0": { - "buffer": "buffer_0", - "byteLength": 4767200, - "byteOffset": 0, - "target": 34962 - }, - "bufferView_1": { - "buffer": "buffer_0", - "byteLength": 797076, - "byteOffset": 4767200, - "target": 34963 - } - }, - "images": { - "color": { - "uri": "color.jpeg" - }, - "normals": { - "uri": "normal.png" - } - }, - "materials": { - "material_lambert2SG": { - "name": "lambert2SG", - "extensions": {}, - "values": { - "ambient": [ - 0, - 0, - 0, - 1 - ], - "diffuse": "texture_color", - "normalMap": "texture_normal", - "emission": [ - 0, - 0, - 0, - 1 - ], - "specular": [ - 0, - 0, - 0, - 1 - ], - "shininess": 0, - "transparency": 1 - }, - "technique": "technique0" - } - }, - "meshes": { - "mesh_sponza": { - "name": "sponza", - "primitives": [ - { - "attributes": { - "POSITION": "accessor_position", - "NORMAL": "accessor_normal", - "TEXCOORD_0": "accessor_uv" - }, - "indices": "accessor_index_0", - "material": "material_lambert2SG", - "mode": 4 - } - ] - } - }, - "nodes": { - "rootNode": { - "children": [], - "meshes": [ - "mesh_sponza" - ], - "matrix": [ - 1, - 0, - 0, - 0, - 0, - 1, - 0, - 0, - 0, - 0, - 1, - 0, - 0, - 0, - 0, - 1 - ] - } - }, - "samplers": { - "sampler_0": { - "magFilter": 9729, - "minFilter": 9986, - "wrapS": 10497, - "wrapT": 10497 - } - }, - "scene": "scene_sponza", - "scenes": { - "scene_sponza": { - "nodes": [ - "rootNode" - ] - } - }, - "textures": { - "texture_color": { - "format": 6407, - "internalFormat": 6407, - "sampler": "sampler_0", - "source": "color", - "target": 3553, - "type": 5121 - }, - "texture_normal": { - "format": 6407, - "internalFormat": 6407, - "sampler": "sampler_0", - "source": "normals", - "target": 3553, - "type": 5121 - } - }, - "extensionsUsed": [], - "animations": {}, - "cameras": {}, - "techniques": { - 
"technique0": { - "attributes": { - "a_position": "position", - "a_normal": "normal", - "a_texcoord_0": "texcoord_0" - }, - "parameters": { - "modelViewMatrix": { - "semantic": "MODELVIEW", - "type": 35676 - }, - "projectionMatrix": { - "semantic": "PROJECTION", - "type": 35676 - }, - "normalMatrix": { - "semantic": "MODELVIEWINVERSETRANSPOSE", - "type": 35675 - }, - "ambient": { - "type": 35666 - }, - "diffuse": { - "type": 35678 - }, - "normalMap": { - "type": 35678 - }, - "emission": { - "type": 35666 - }, - "specular": { - "type": 35666 - }, - "shininess": { - "type": 5126 - }, - "transparency": { - "type": 5126 - }, - "position": { - "semantic": "POSITION", - "type": 35665 - }, - "normal": { - "semantic": "NORMAL", - "type": 35665 - }, - "texcoord_0": { - "semantic": "TEXCOORD_0", - "type": 35664 - } - }, - "program": "program0", - "states": { - "enable": [ - 2884, - 2929 - ] - }, - "uniforms": { - "u_modelViewMatrix": "modelViewMatrix", - "u_projectionMatrix": "projectionMatrix", - "u_normalMatrix": "normalMatrix", - "u_ambient": "ambient", - "u_diffuse": "diffuse", - "u_normal": "normalMap", - "u_emission": "emission", - "u_specular": "specular", - "u_shininess": "shininess", - "u_transparency": "transparency" - } - } - }, - "programs": { - "program0": { - "attributes": [ - "a_position", - "a_normal", - "a_texcoord_0" - ], - "fragmentShader": "fragmentShader0", - "vertexShader": "vertexShader0" - } - }, - "shaders": { - "vertexShader0": { - "type": 35633, - "uri": "vertexShader0.glsl" - }, - "fragmentShader0": { - "type": 35632, - "uri": "fragmentShader0.glsl" - } - }, - "skins": {}, - "extensions": {} -} diff --git a/models/gltf-sponza-kai-fix/vertexShader0.glsl b/models/gltf-sponza-kai-fix/vertexShader0.glsl deleted file mode 100644 index c489a55..0000000 --- a/models/gltf-sponza-kai-fix/vertexShader0.glsl +++ /dev/null @@ -1,17 +0,0 @@ -precision highp float; -uniform mat4 u_modelViewMatrix; -uniform mat4 u_projectionMatrix; -uniform mat3 u_normalMatrix; -attribute vec3 a_position; -varying vec3 v_positionEC; -attribute vec3 a_normal; -varying vec3 v_normal; -attribute vec2 a_texcoord_0; -varying vec2 v_texcoord_0; -void main(void) { - vec4 pos = u_modelViewMatrix * vec4(a_position,1.0); - v_positionEC = pos.xyz; - gl_Position = u_projectionMatrix * pos; - v_normal = u_normalMatrix * a_normal; - v_texcoord_0 = a_texcoord_0; -} diff --git a/parallel_fft/CMakeLists.txt b/parallel_fft/CMakeLists.txt new file mode 100644 index 0000000..24f9d63 --- /dev/null +++ b/parallel_fft/CMakeLists.txt @@ -0,0 +1,11 @@ +set(SOURCE_FILES + "common.h" + "common.cu" + "fft.h" + "fft.cu" + ) + +cuda_add_library(parallel_fft + ${SOURCE_FILES} + OPTIONS -arch=sm_20 + ) diff --git a/parallel_fft/common.cu b/parallel_fft/common.cu new file mode 100644 index 0000000..dd35ab4 --- /dev/null +++ b/parallel_fft/common.cu @@ -0,0 +1,52 @@ +#include "common.h" + +void checkCUDAErrorFn(const char *msg, const char *file, int line) { + cudaError_t err = cudaGetLastError(); + if (cudaSuccess == err) { + return; + } + + fprintf(stderr, "CUDA error"); + if (file) { + fprintf(stderr, " (%s:%d)", file, line); + } + fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err)); + exit(EXIT_FAILURE); +} + + +namespace StreamCompaction { +namespace Common { + +/** + * Maps an array to an array of 0s and 1s for stream compaction. Elements + * which map to 0 will be removed, and elements which map to 1 will be kept. 
+ */
+__global__ void kernMapToBoolean(int n, int *bools, const int *idata)
+{
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+    if (index >= n)
+        return;
+
+    bools[index] = idata[index] ? 1 : 0;
+}
+
+/**
+ * Performs scatter on an array. That is, for each element in idata,
+ * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
+ */
+__global__ void kernScatter(int n, int *odata,
+        const int *idata, const int *bools, const int *indices)
+{
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+    if (index >= n)
+        return;
+
+    if (bools[index])
+        odata[indices[index]] = idata[index];
+}
+
+}
+}
diff --git a/parallel_fft/common.h b/parallel_fft/common.h
new file mode 100644
index 0000000..29653f4
--- /dev/null
+++ b/parallel_fft/common.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cuda_runtime.h>
+
+#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
+#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
+
+#define blocksize 128
+
+/**
+ * Check for CUDA errors; print and exit if there was a problem.
+ */
+void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1);
+
+inline int ilog2(int x) {
+    int lg = 0;
+    while (x >>= 1) {
+        ++lg;
+    }
+    return lg;
+}
+
+inline int ilog2ceil(int x) {
+    return ilog2(x - 1) + 1;
+}
\ No newline at end of file
diff --git a/parallel_fft/fft.cu b/parallel_fft/fft.cu
new file mode 100644
index 0000000..838e720
--- /dev/null
+++ b/parallel_fft/fft.cu
@@ -0,0 +1,256 @@
+#include "fft.h"
+
+#define blockSize 128
+#define CHECKPOINT 0
+thrust::complex<double> * dev_isamples;
+thrust::complex<double> * dev_osamples;
+
+#if CHECKPOINT
+void checkpoint(const char * print_me, int N, thrust::complex<double> * buf)
+{
+    printf("%s", print_me);
+
+    for (int i = 0; i < N; ++i)
+        printf("%f\n", thrust::abs(buf[i]));
+
+}
+#endif
+
+__host__ __device__ int ilog2_2(int x) {
+    int lg = 0;
+    while (x >>= 1) {
+        ++lg;
+    }
+    return lg;
+}
+
+__host__ __device__ int ilog2ceil_2(int x) {
+    return ilog2_2(x - 1) + 1;
+}
+
+void fft_init(int N)
+{
+    cudaMalloc((void **)&dev_isamples, N * sizeof(thrust::complex<double>));
+    checkCUDAError("cudaMalloc dev_isamples failed!");
+    cudaMalloc((void **)&dev_osamples, N * sizeof(thrust::complex<double>));
+    checkCUDAError("cudaMalloc dev_osamples failed!");
+}
+
+void fft_free()
+{
+    cudaFree(dev_isamples);
+    cudaFree(dev_osamples);
+}
+
+void ping_pong(thrust::complex<double> ** a, thrust::complex<double> ** b)
+{
+    thrust::complex<double> * temp = *a;
+    *a = *b;
+    *b = temp;
+}
+
+
+
+
+__device__ unsigned int twiddle(unsigned int x)
+{
+    // strictly reverses all 32 bits; the caller must shift the result in the calling context
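+    // e.g. for N = 8 (3 significant bits), index 3 = 0b011 reverses to
+    // 0b110 = 6 once the caller shifts right by (32 - ilog2ceil_2(N)) = 29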
+    x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1));
+    x = (((x & 0xcccccccc) >> 2) | ((x & 0x33333333) << 2));
+    x = (((x & 0xf0f0f0f0) >> 4) | ((x & 0x0f0f0f0f) << 4));
+    x = (((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8));
+    return ((x >> 16) | (x << 16));
+}
+
+
+
+
+__global__ void inputScramble(int N, thrust::complex<double> * idata, thrust::complex<double> * odata)
+{
+
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+    if (index >= N)
+        return;
+
+    //do global memory access
+    thrust::complex<double> myVal = idata[index];
+
+    //hide latency with computation
+    int out_index = twiddle(index) >> (32 - ilog2ceil_2(N));
+#if CHECKPOINT
+    printf("iindex is %d oindex is %d abs is %f\n", index, out_index, thrust::abs(myVal));
+#endif
+    odata[out_index] = myVal;
+}
+
+
+
+
+__global__ void doButterfly(int N, int stage, int numPoints,
+        thrust::complex<double> * idata, thrust::complex<double> * odata)
+{
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+    if (index >= N)
+        return;
+
+    thrust::complex<double> point = idata[index];
+
+    // N/2
+    int half_points = numPoints / 2; //also shift?
+
+    // Relative index in this Fourier transform
+    int relativeIndex = index % numPoints;
+
+    thrust::complex<double> point2;
+
+    if (relativeIndex < half_points)
+    {
+        // add the element numPoints/2 above to self
+        point2 = idata[index + half_points];
+    }
+    else
+    {
+        // subtract self from the element numPoints/2 below
+        point2 = idata[index - half_points];
+        point *= -1.0;
+        //thrust::complex exponent = (relativeIndex % half_points) * (ilog2ceil_2(N) - stage);
+        //point = point2 - thrust::pow(W, exponent) * point;
+    }
+
+    point = point + point2;
+
+#if CHECKPOINT
+    printf("i am %d, combining with %d\n", index, relativeIndex < half_points ? index + half_points : index - half_points);
+    printf("half_points is %d, relativeIndex is %d\n", half_points, relativeIndex);
+#endif
+
+    odata[index] = point;
+}
+
+
+
+
+
+
+// in place multiplication of twiddle factors
+__global__ void doMultiply(int N, int numPoints, thrust::complex<double> W, thrust::complex<double> * idata)
+{
+
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+    if (index >= N)
+        return;
+
+    //do global memory access
+    thrust::complex<double> myVal = idata[index];
+
+    int relativeIndex = index % numPoints;
+
+    if (relativeIndex < numPoints / 2)
+        return;
+
+#if CHECKPOINT
+    printf("my index is %d, myVal is %f + i%f, W is %f + i%f, my exponent is %d\n", index, myVal.real(), myVal.imag(), W.real(), W.imag(), relativeIndex - numPoints / 2);
+#endif
+
+    thrust::complex<double> exponent = thrust::complex<double>((double)relativeIndex - (double)numPoints / 2.0f, 0.0f);
+
+#if CHECKPOINT
+    printf("my index is %d, relative index is %d, numPoints is %d, exponent is %f + i %f\n", index, relativeIndex, numPoints, exponent.real(), exponent.imag());
+#endif
+
+    myVal *= thrust::pow(W, exponent);
+
+#if CHECKPOINT
+    thrust::complex<double> tempResult = thrust::pow(W, exponent);
+    printf("my index is %d, twiddleFactor is %f + i %f, newVal is %f + i%f\n", index, tempResult.real(), tempResult.imag(), myVal.real(), myVal.imag());
+#endif
+
+    idata[index] = myVal;
+}
+
+/*
+parallel FFT implementation
+
+inputs:
+int N - number of samples
+thrust::complex<double> * samples - pointer to array of samples (of size N)
+thrust::complex<double> * transform - pointer to array where the transform should be stored.
+                    It is safe for this to be the same as samples (i.e. in place).
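+
+Each pass i of the loop below merges pairs of (numPoints/2)-point transforms
+into numPoints-point transforms via the radix-2 decimation-in-time butterfly:
+    X[k]                = E[k] + W^k * O[k]
+    X[k + numPoints/2]  = E[k] - W^k * O[k],    W = exp(-2*pi*j / numPoints)
+where E is the lower half of each block and O the upper half after the
+bit-reversal scramble. doMultiply applies the W^k factors to the upper half of
+each block and doButterfly performs the add/subtract.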
+
+*/
+
+void parallel_fft (int N,
+        thrust::complex<double> * samples,
+        thrust::complex<double> * transform)
+{
+
+#if CHECKPOINT
+    checkpoint("initial samples\n", N, samples);
+#endif
+
+    //allocate buffers
+    fft_init(N);
+
+#if CHECKPOINT
+    thrust::complex<double> * checkpoint_buf = (thrust::complex<double> *) calloc(N, sizeof(thrust::complex<double>));
+#endif
+
+    //compute numBlocks
+    dim3 numBlocks = (N + blockSize - 1) / blockSize;
+
+    cudaMemcpy(dev_isamples, samples, sizeof(thrust::complex<double>) * N, cudaMemcpyHostToDevice);
+    checkCUDAError("cudaMemcpy sample data to device failed!");
+
+#if CHECKPOINT
+    cudaMemcpy(checkpoint_buf, dev_isamples, N * sizeof(thrust::complex<double>), cudaMemcpyDeviceToHost);
+    checkpoint("initial samples on device\n", N, checkpoint_buf);
+#endif
+
+
+    //scramble inputs to reverse-binary order
+    inputScramble<<<numBlocks, blockSize>>>(N, dev_isamples, dev_osamples);
+    checkCUDAError("kernel inputScramble failed!");
+
+#if CHECKPOINT
+    cudaMemcpy(checkpoint_buf, dev_osamples, N * sizeof(thrust::complex<double>), cudaMemcpyDeviceToHost);
+    checkpoint("after scramble\n", N, checkpoint_buf);
+#endif
+
+    //ping pong buffers
+    ping_pong(&dev_isamples, &dev_osamples);
+
+    //Butterfly
+    for (int i = 0; i < ilog2ceil(N); ++i)
+    {
+        int numPoints = pow(2, i + 1);
+        // create the twiddle base W for this stage (block length numPoints)
+        thrust::complex<double> W (cos((2.0 * M_PI) / numPoints), -1.0 * sin((2.0 * M_PI) / numPoints));
+
+        //pre-multiply points by the necessary twiddle factors
+        doMultiply<<<numBlocks, blockSize>>>(N, numPoints, W, dev_isamples);
+        checkCUDAError("doMultiply failed!");
+
+        doButterfly<<<numBlocks, blockSize>>>(N, i, numPoints, dev_isamples, dev_osamples);
+        checkCUDAError("kernel doButterfly failed!");
+
+#if CHECKPOINT
+        cudaMemcpy(checkpoint_buf, dev_osamples, N * sizeof(thrust::complex<double>), cudaMemcpyDeviceToHost);
+        checkpoint("after butterfly\n", N, checkpoint_buf);
+#endif
+
+        ping_pong(&dev_isamples, &dev_osamples);
+    }
+
+    //copy result to output
+    cudaMemcpy(transform, dev_isamples, N * sizeof(thrust::complex<double>), cudaMemcpyDeviceToHost);
+
+    //free buffers
+    fft_free();
+
+#if CHECKPOINT
+    free(checkpoint_buf);
+#endif
+}
diff --git a/parallel_fft/fft.h b/parallel_fft/fft.h
new file mode 100644
index 0000000..278cb5c
--- /dev/null
+++ b/parallel_fft/fft.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include "common.h"
+#include <cstdio>
+#include <cmath>
+#include <cuda_runtime.h>
+#include <thrust/complex.h>
+
+#define M_PI 3.14159265358979323846
+
+void parallel_fft (int N, thrust::complex<double> * samples, thrust::complex<double> * transform);
+
+void checkpoint(const char * print_me, int N, thrust::complex<double> * buf);
\ No newline at end of file
diff --git a/server.bat b/server.bat
deleted file mode 100644
index 098de65..0000000
--- a/server.bat
+++ /dev/null
@@ -1 +0,0 @@
-python server.py
\ No newline at end of file
diff --git a/server.py b/server.py
deleted file mode 100755
index 17ff1a6..0000000
--- a/server.py
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/usr/bin/python
-
-import sys
-if sys.version_info.major == 2:
-    from SimpleHTTPServer import SimpleHTTPRequestHandler
-    from SocketServer import TCPServer
-elif sys.version_info.major == 3:
-    from http.server import SimpleHTTPRequestHandler
-    from socketserver import TCPServer
-
-PORT = 10565
-
-Handler = SimpleHTTPRequestHandler
-
-httpd = TCPServer(("", PORT), Handler)
-
-print("serving at port {}".format(PORT))
-httpd.serve_forever()
diff --git a/src/main.cpp b/src/main.cpp
new file mode 100644
index 0000000..3689c4a
--- /dev/null
+++ b/src/main.cpp
@@ -0,0 +1,41 @@
+#include <cstdio>
+#include <cmath>
+#include <parallel_fft/fft.h>
+
+#define REAL 0
+#define IMAG 1
+#define M_PI 3.14159265358979323846
+#define NUM_POINTS 8
+
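+// acquire_from_somewhere() below fills the input with two complex exponentials:
+// with theta = i * PI / NUM_POINTS, the amplitude-1.0 component falls on bin 5
+// and should produce a dominant peak of magnitude ~NUM_POINTS in the printed
+// spectrum, while the amplitude-0.5 component (frequency 12.5) does not land
+// on an integer bin and leaks across the remaining outputs.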
+thrust::complex<double> signal[NUM_POINTS];
+thrust::complex<double> result[NUM_POINTS];
+
+void acquire_from_somewhere( thrust::complex<double> * signal ) {
+    /* Generate two sine waves of different frequencies and
+     * amplitudes.
+     */
+
+    int i;
+    for (i = 0; i < NUM_POINTS; ++i) {
+        double theta = (double)i / (double)NUM_POINTS * M_PI;
+
+        signal[i] = thrust::complex<double> (1.0 * cos(10.0 * theta) +
+                              0.5 * cos(25.0 * theta),
+                              1.0 * sin(10.0 * theta) +
+                              0.5 * sin(25.0 * theta));
+    }
+}
+
+int main(int argc, char* argv[]) {
+    //const int SIZE = 1 << 8;
+    //const int NPOT = NUM_POINTS - 3;
+    //thrust::complex a[SIZE], b[SIZE], c[SIZE];
+    acquire_from_somewhere(signal);
+    parallel_fft(NUM_POINTS, signal, result);
+
+    for (int i = 0; i < NUM_POINTS; ++i) {
+        double mag = thrust::abs(result[i]);
+
+        printf("%f\n", mag);
+    }
+}
diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp
new file mode 100644
index 0000000..f6b572f
--- /dev/null
+++ b/src/testing_helpers.hpp
@@ -0,0 +1,61 @@
+#pragma once
+
+#include <cstdio>
+#include <cstdlib>
+
+template <typename T>
+int cmpArrays(int n, T *a, T *b) {
+    for (int i = 0; i < n; i++) {
+        if (a[i] != b[i]) {
+            printf("    a[%d] = %d, b[%d] = %d\n", i, a[i], i, b[i]);
+            return 1;
+        }
+    }
+    return 0;
+}
+
+void printDesc(const char *desc) {
+    printf("==== %s ====\n", desc);
+}
+
+template <typename T>
+void printCmpResult(int n, T *a, T *b) {
+    printf("    %s \n",
+            cmpArrays(n, a, b) ? "FAIL VALUE" : "passed");
+}
+
+template <typename T>
+void printCmpLenResult(int n, int expN, T *a, T *b) {
+    if (n != expN) {
+        printf("    expected %d elements, got %d\n", expN, n);
+    }
+    printf("    %s \n",
+            (n == -1 || n != expN) ? "FAIL COUNT" :
+            cmpArrays(n, a, b) ? "FAIL VALUE" : "passed");
+}
+
+void zeroArray(int n, int *a) {
+    for (int i = 0; i < n; i++) {
+        a[i] = 0;
+    }
+}
+
+void genArray(int n, int *a, int maxval) {
+    srand(0);
+
+    for (int i = 0; i < n; i++) {
+        a[i] = rand() % maxval;
+    }
+}
+
+void printArray(int n, int *a, bool abridged = false) {
+    printf("    [ ");
+    for (int i = 0; i < n; i++) {
+        if (abridged && i + 2 == 15 && n > 16) {
+            i = n - 2;
+            printf("... ");
+        }
+        printf("%3d ", a[i]);
+    }
+    printf("]\n");
+}
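+
+// A minimal sketch of a tolerance-based comparison for floating-point results
+// (e.g. FFT magnitudes checked against a CPU reference), where exact equality
+// is too strict. The helper name and the choice of eps are placeholders.
+template <typename T>
+int cmpArraysEps(int n, T *a, T *b, double eps) {
+    for (int i = 0; i < n; i++) {
+        double diff = (double)a[i] - (double)b[i];
+        if (diff < 0.0) {
+            diff = -diff;
+        }
+        if (diff > eps) {
+            printf("    a[%d] = %f, b[%d] = %f\n", i, (double)a[i], i, (double)b[i]);
+            return 1;
+        }
+    }
+    return 0;
+}
+
+// Usage (hypothetical): cmpArraysEps(NUM_POINTS, gpuMagnitudes, cpuMagnitudes, 1e-6);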